General Setup

Setup chunk

Setup reticulate

knitr::opts_chunk$set(fig.width = 8)
knitr::opts_knit$set(root.dir = normalizePath(".."))
knitr::opts_knit$get("root.dir")
[1] "/nas/groups/treutlein/USERS/tomasgomes/projects/liver_regen"

Load libraries

library(Seurat)
library(ggplot2)
library(ggridges)
library(dplyr)
library(destiny)
library(fdrtool)
library(gam)
library(gamclass)
library(dtw)
library(data.table)
library(clusterProfiler)
library(ReactomePA)

Function for cross-validation

CVgam = function (formula, data, nfold = 10, debug.level = 0, method = "glm.fit", 
    printit = TRUE, cvparts = NULL, gamma = 1, seed = 29){
  # modified from the `gamclass` package to use gam::gam
    if (is.null(cvparts)) {
        set.seed(seed)
        cvparts <- sample(1:nfold, nrow(data), replace = TRUE)
    }
    folds <- unique(cvparts)
    khat <- hat <- numeric(nrow(data))
    scale.gam <- summary(gam::gam(formula, data = data, method = method))$scale
    for (i in folds) {
        trainrows <- cvparts != i
        testrows <- cvparts == i
        elev.gam <- gam::gam(formula, data = data[trainrows, ], method = method, 
            gamma = gamma)
        hat[testrows] <- predict(elev.gam, newdata = data[testrows, 
            ], select = TRUE)
        res <- residuals(elev.gam)
    }
    y <- eval(formula[[2]], envir = as.data.frame(data))
    res <- y - hat
    cvscale <- sum(res^2)/length(res)
    prntvec <- c(GAMscale = scale.gam, `CV-mse-GAM ` = cvscale)
    if (printit) 
        print(round(prntvec, 4))
    invisible(list(fitted = hat, resid = res, cvscale = cvscale, 
        scale.gam = scale.gam))
}

Load data

Load data (from all cells)

CVgam = function (formula, data, nfold = 10, debug.level = 0, method = "glm.fit", 
    printit = TRUE, cvparts = NULL, gamma = 1, seed = 29){
  # modified from the `gamclass` package to use gam::gam
    if (is.null(cvparts)) {
        set.seed(seed)
        cvparts <- sample(1:nfold, nrow(data), replace = TRUE)
    }
    folds <- unique(cvparts)
    khat <- hat <- numeric(nrow(data))
    scale.gam <- summary(gam::gam(formula, data = data, method = method))$scale
    for (i in folds) {
        trainrows <- cvparts != i
        testrows <- cvparts == i
        elev.gam <- gam::gam(formula, data = data[trainrows, ], method = method, 
            gamma = gamma)
        hat[testrows] <- predict(elev.gam, newdata = data[testrows, 
            ], select = TRUE)
        res <- residuals(elev.gam)
    }
    y <- eval(formula[[2]], envir = as.data.frame(data))
    res <- y - hat
    cvscale <- sum(res^2)/length(res)
    prntvec <- c(GAMscale = scale.gam, `CV-mse-GAM ` = cvscale)
    if (printit) 
        print(round(prntvec, 4))
    invisible(list(fitted = hat, resid = res, cvscale = cvscale, 
        scale.gam = scale.gam))
}

Hepatocytes

Subset and process each condition

allcells_css = readRDS(file = "data/processed/allcells_css.RDS")

Healthy analysis

For healthy cells (which will serve as reference), do additional filtering and reprocessing

DimPlot(hep_cells$healthy, reduction = "umap", group.by = "unique_name")
outcells = (hep_cells$healthy@reductions$umap@cell.embeddings[,1]>(1) &
              hep_cells$healthy@reductions$umap@cell.embeddings[,2]>5)
hep_cells$healthy = hep_cells$healthy[,!outcells]

hep_cells$healthy = suppressWarnings(SCTransform(hep_cells$healthy, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
hep_cells$healthy = RunPCA(hep_cells$healthy, verbose = F)
hep_cells$healthy = RunUMAP(hep_cells$healthy, dims = 1:15, verbose = F)
DimPlot(hep_cells$healthy, reduction = "umap", group.by = "unique_name")

hep_cells$healthy = FindNeighbors(hep_cells$healthy, reduction = "pca", dims = 1:15,
                                  force.recalc = T, graph.name = "hep")
hep_cells$healthy = FindClusters(hep_cells$healthy, graph.name = "hep", algorithm = 4, 
                                 resolution = seq(0.1,0.8,0.1), verbose = F)
DimPlot(hep_cells$healthy, reduction = "umap", group.by = "hep_res.0.3")
hep_cells$healthy = SetIdent(hep_cells$healthy, value = "hep_res.0.6")
mk_hep_h = FindAllMarkers(hep_cells$healthy, assay = "SCT")

saveRDS(hep_cells$healthy, file = "results/zonation_healthy/hep_healthy_srat.RDS")

Save hepatocytes

saveRDS(hep_cells, file="results/zonation_cond/hep_cells.RDS")

Plot expression of zonation markers

gfresh = c("CYP1A2", "CYP2E1", "GLUL", "CYP3A4", "G0S2", "APOC1", "UGT2B17", "BCHE", # central 
           "SAA1", "HAMP", "SAA2", "CPS1", "C3", "C1R", "MT-ND4", "MT-CO2") # portal

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_violin.png", 
    height = 1400, width = 1400)
VlnPlot(hep_cells$healthy, features = gfresh, group.by = "names_clusters", 
        pt.size = 0, sort = "increasing")
dev.off()
png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_umap.png", 
    height = 1400, width = 1400)
FeaturePlot(hep_cells$healthy, features = gfresh, pt.size = 0.6)
dev.off()

Obtain Diffusion maps projection

set.seed(1)
dm = DiffusionMap(hep_cells$healthy@reductions$pca@cell.embeddings[,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm)
plot(dpt)

Check expression of some genes

dpt_flat <- branch_divide(dpt, integer(0L))
root = min(dpt_flat@branch[, 1], na.rm = TRUE)
dpt_vals = destiny:::dpt_for_branch(dpt_flat, root)
df_dc = cbind(hep_cells$healthy@meta.data, 
              data.frame("DC1" = dpt@dm$DC1, "DC2" = dpt@dm$DC2, "DPT" = dpt_vals),
              Matrix::t(hep_cells$healthy@assays$SCT@data[gfresh,]))
colnames(df_dc) = gsub("-", ".", colnames(df_dc), fixed = T)

plt_list = list()
for(n in c(gsub("-", ".", gfresh, fixed = T), "names_clusters", "unique_name")){
  plt_list[[n]] = ggplot(df_dc, aes_string(x = "DC1", y = "DC2", colour = n))+
    geom_point()+
    theme_classic()
}

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_dc.png", 
    height = 900, width = 2800)
cowplot::plot_grid(plotlist = plt_list, nrow = 3, ncol = 6, align = "hv")
dev.off()

We’ll be using DC1 as the pseudotime. This was the best approximation possible to the zonation gradient based on diffusion maps

dpt_flat <- branch_divide(dpt, integer(0L))
root = min(dpt_flat@branch[, 1], na.rm = TRUE)
dpt_vals = destiny:::dpt_for_branch(dpt_flat, root)
df_dc = cbind(hep_cells$healthy@meta.data, 
              data.frame("DC1" = dpt@dm$DC1, "DC2" = dpt@dm$DC2, "DPT" = dpt_vals),
              Matrix::t(hep_cells$healthy@assays$SCT@data[gfresh,]))
colnames(df_dc) = gsub("-", ".", colnames(df_dc), fixed = T)

plt_list = list()
for(n in c(gsub("-", ".", gfresh, fixed = T), "names_clusters", "unique_name")){
  plt_list[[n]] = ggplot(df_dc, aes_string(x = "DC1", y = "DC2", colour = n))+
    geom_point()+
    theme_classic()
}

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_dc.png", 
    height = 900, width = 2800)
cowplot::plot_grid(plotlist = plt_list, nrow = 3, ncol = 6, align = "hv")
dev.off()
null device 
          1 

Find genes varying along the pseudotime

pdf("results/zonation_healthy/hep_distributions_dc1.pdf", useDingbats = F, 
    height = 4, width = 9)
plt1 = ggplot(df_dc, aes(x = DC1, y = names_clusters, 
                  group = names_clusters, fill = names_clusters))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

plt2 = ggplot(df_dc, aes(x = DC1, y = unique_name, 
                  group = unique_name, fill = unique_name))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

cowplot::plot_grid(plt1, plt2, ncol = 2, align = "h")
Picking joint bandwidth of 0.00354
Picking joint bandwidth of 0.00457
dev.off()
null device 
          1 

Analysis of other conditions

Now use the top genes to learn the pseudozonation coordinates and project the embolised and regenerating data

# will only use variable genes
hvg = hep_cells$healthy@assays$SCT@var.features

# Fit GAM for each gene using pseudotime as independent variable.
t = df_dc$DC1
t = scales::rescale(rank(df_dc$DC1), c(0.00001, 0.99999))
gene_fit_p = c()
gene_fit_vals = data.frame(row.names = 1:100)
for(i in 1:length(hvg)){
  g = hvg[i]
  z = hep_cells$healthy@assays$SCT@data[g,]
  
  d = data.frame(z=z, t=t)
  tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
  
  # bins for model fitting
  bb = seq(min(t), max(t), length.out = 100)
  gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                               newdata = data.frame(t = bb)))
  p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
  gene_fit_p = c(gene_fit_p, p)
  names(gene_fit_p)[length(gene_fit_p)] = g
}

gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                              verbose=F, cutoff.method="pct0", pct0=0.9)$qval

Calculate correlations

sig_genes = gene_fit_p[gene_fit_p<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:1000])
top_sig_genes = top_sig_genes[top_sig_genes %in% rownames(hep_cells$embolised@assays$SCT@data) &
                                top_sig_genes %in%  rownames(hep_cells$regenerating@assays$SCT@data)]

data_gam_h = cbind(data.frame("DC1" = df_dc$DC1),
                   Matrix::t(hep_cells$health@assays$SCT@data[top_sig_genes,]))
#data_gam_h$DC1 = scales::rescale(-data_gam_h$DC1, c(0.00001, 0.99999))
data_gam_h$DC1 = scales::rescale(rank(data_gam_h$DC1), c(0.00001, 0.99999))
#bins_dc1 = cut(data_gam_h$DC1, 200)
#cells_use = unlist(tapply(1:length(bins_dc1), bins_dc1, function(x) sample(x, 12)))

gam_healthy = gam::gam(DC1 ~ ., data = data_gam_h, family = mgcv::betar(link = "logit", eps = 0.00001))
#gam_healthy = gam::gam(DC1 ~ ., data = data_gam_h)

hea_pred = predict(gam_healthy,
                   Matrix::t(hep_cells$healthy@assays$SCT@data[top_sig_genes,]), type = "response")
reg_pred = predict(gam_healthy,
                   Matrix::t(hep_cells$regenerating@assays$SCT@data[top_sig_genes,]), type = "response")
emb_pred = predict(gam_healthy,
                   Matrix::t(hep_cells$embolised@assays$SCT@data[top_sig_genes,]), type = "response")

plot(density(reg_pred), col = "salmon", ylim = c(0,3.25))
lines(density(emb_pred), col = "darkred")
lines(density(data_gam_h$DC1), col = "orange")
lines(density(hea_pred), col = "grey50")
legend(-0.073,32, legend = c("healthy (DC1)", "healthy (pred)", 
                         "embolised (pred)", "regenerating (pred)"),
       fill = c("green", "grey50", "red", "blue"))


plot_df = data.frame(vals = c(data_gam_h$DC1, emb_pred, reg_pred), 
                     Condition = c(rep("healthy", length(data_gam_h$DC1)), 
                                   rep("embolised", length(emb_pred)), 
                                   rep("regenerating", length(reg_pred))),
                     Donor = c(hep_cells$healthy$Donor, 
                               hep_cells$embolised$Donor, hep_cells$regenerating$Donor))
plot_df$bins100 = cut(plot_df$vals, 10)
plot_df$Condition = factor(plot_df$Condition, levels = c("healthy", "embolised", "regenerating"))
ggplot(plot_df, aes(x = bins100, fill = Condition))+
     geom_bar(position = "fill")+
     labs(x = "zonation (binned)", y = "proportion")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.x = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")

ggplot(plot_df, aes(x = bins100, fill = Condition))+
     facet_wrap(~Condition)+
     geom_bar()+
     labs(x = "zonation (binned)", y = "Number of cells")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     coord_flip()+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.y = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")

ggplot(plot_df, aes(x = vals, colour = Donor))+
     facet_wrap(~Condition)+
     geom_density()+
     labs(x = "zonation (binned)", y = "proportion")+
     theme_bw()+
     theme(legend.title.align = 0,
           legend.position = "bottom")


# data_gam_h$bins20 = cut(data_gam_h$DC1, 20)
# mean_bins = apply(data_gam_h[,2:501], 2, function(x) tapply(x, data_gam_h$bins20, mean))
# cor_reg_bins = cor(as.matrix(hep_cells$regenerating@assays$SCT@data[colnames(mean_bins),]), 
#                    t(mean_bins), method = "sp")
# max_cor_regen = apply(cor_reg_bins, 1, which.max)
# testdf = data.frame(bins = max_cor_regen, dons = hep_cells$regenerating$Donor)
# ggplot(testdf, aes(x = bins, fill = dons))+geom_bar()+facet_wrap(~dons, scales = "free_y")

reg_df = data.frame("pt" = reg_pred, "don" = hep_cells$regenerating$Donor)
reg_df = cbind(reg_df, Matrix::t(hep_cells$regenerating@assays$SCT@data[gfresh,]))
ggplot(reg_df, aes(x = pt, y = SAA1, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)+
  geom_hline(yintercept = 3)+
  geom_vline(xintercept = c(0,1))


ggplot(reg_df, aes(x = SAA2, y = GLUL, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)


traj_list = list("healthy" = data_gam_h$DC1, "embolised" = emb_pred, "regenerating" = reg_pred)

Find varying genes - using all cells along the ranked pseudotime

cor_h = t(cor(traj_list$healthy, as.matrix(Matrix::t(hep_cells$healthy@assays$SCT@data)),
              method = "sp"))
cor_emb = t(cor(traj_list$embolised, as.matrix(Matrix::t(hep_cells$embolised@assays$SCT@data)),
                method = "sp"))
the standard deviation is zero
cor_reg = t(cor(traj_list$regenerating, as.matrix(Matrix::t(hep_cells$regenerating@assays$SCT@data)),
                method = "sp"))
the standard deviation is zero
l = list(cor_h, cor_emb, cor_reg)
all_corr = Reduce(function(x, y) merge(x,y, by = "rn", all = T), 
                  lapply(l, function(x) data.frame(x, rn = row.names(x))))
colnames(all_corr) = c("genes", "healthy", "embolised", "regenerating")

write.csv(all_corr, file = paste0("results/zonation_cond/hep_correlations_zonation_rank.csv"),
          col.names = T, row.names = F, quote = F)
attempt to set 'col.names' ignored

Save trajectories and genes

qvals_list = list()
fits_list = list()

# get HVG from all conditions
hvg = unique(c(hep_cells[["healthy"]]@assays$SCT@var.features,
               hep_cells[["embolised"]]@assays$SCT@var.features,
               hep_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(hep_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["regenerating"]]@assays$SCT@data)]

for(cond in names(hep_cells)){
  print(cond)
  # Fit GAM for each gene using pseudotime as independent variable.
  t = traj_list[[cond]]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = hep_cells[[cond]]@assays$SCT@data[g,]
    
    d = data.frame(z=z, t=t)
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_list[[cond]] = gene_fit_p
  fits_list[[cond]] = gene_fit_vals
}
[1] "healthy"
[1] "embolised"
[1] "regenerating"

Find varying genes - normalised by bins along the ranked pseudotime

save(fits_list, qvals_list, traj_list, 
     file = "results/zonation_cond/hep_traj_qval_fits_rank.RData")

Save trajectories and genes

save(fits_b_list, qvals_b_list, traj_list, 
     file = "results/zonation_cond/hep_binned_traj_qval_fits_rank.RData")

Find varying genes - using each donor as a replicate and calculating the average expression for each of 100 bins

save(fits_b_list, qvals_b_list, traj_list, 
     file = "results/zonation_cond/hep_binned_traj_qval_fits_rank.RData")

Save trajectories and genes

save(fits_db_list, qvals_db_list, traj_list, 
     file = "results/zonation_cond/hep_binnedDonor_traj_qval_fits_rank.RData")

Plot cell distributions in pseudotime

plot_df = data.frame("pseudotime" = c(traj_list$healthy, traj_list$embolised,
                                      traj_list$regenerating),
           "donor" = c(hep_cells$healthy@meta.data[,"Donor"],
                       hep_cells$embolised@meta.data[names(traj_list$embolised),"Donor"],
                       hep_cells$regenerating@meta.data[names(traj_list$regenerating),"Donor"]),
           "cond" = c(rep("healthy", length(traj_list$healthy)), 
                      rep("embolised", length(traj_list$embolised)),
                      rep("regenerating", length(traj_list$regenerating))))

pdf("results/zonation_cond/hep_cell_distributions_zonation.pdf", useDingbats = F, 
    width = 6, height = 5)
ggplot(plot_df, aes(x = donor, y = pseudotime, fill = cond))+
  geom_boxplot(notch = T)+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, group = donor, colour = donor))+
  facet_wrap(~cond)+
  geom_density()+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, colour = cond))+
  geom_density()+
  theme_bw()
dev.off()

Do cross validation on the healthy trajectory

plot_df = data.frame("pseudotime" = c(traj_list$healthy, traj_list$embolised,
                                      traj_list$regenerating),
           "donor" = c(hep_cells$healthy@meta.data[,"Donor"],
                       hep_cells$embolised@meta.data[names(traj_list$embolised),"Donor"],
                       hep_cells$regenerating@meta.data[names(traj_list$regenerating),"Donor"]),
           "cond" = c(rep("healthy", length(traj_list$healthy)), 
                      rep("embolised", length(traj_list$embolised)),
                      rep("regenerating", length(traj_list$regenerating))))

pdf("results/zonation_cond/hep_cell_distributions_zonation.pdf", useDingbats = F, 
    width = 6, height = 5)
ggplot(plot_df, aes(x = donor, y = pseudotime, fill = cond))+
  geom_boxplot(notch = T)+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, group = donor, colour = donor))+
  facet_wrap(~cond)+
  geom_density()+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, colour = cond))+
  geom_density()+
  theme_bw()
dev.off()
null device 
          1 

Save zonation results

sig_genes = qvals_list$healthy[qvals_list$healthy<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:500]) 

data_gam_h = cbind(data.frame("DC1" = traj_list$healthy),
                   Matrix::t(hep_cells$health@assays$SCT@data[top_sig_genes,]))
colnames(data_gam_h) = gsub("-", "_", colnames(data_gam_h), fixed = T)
colnames(data_gam_h) = gsub(".", "_", colnames(data_gam_h), fixed = T)

cv_gam = CVgam(formula = DC1 ~ ., data = data_gam_h, nfold = 10, seed = 1, method = "glm.fit")
CV-mse-GAM  
     0.0106 
pdf("results/zonation_healthy/hep_original_fitted_pseudotime.pdf", useDingbats = F, 
    width = 6, height = 5)
plot(data_gam_h$DC1, cv_gam$fitted, xlab = "Healthy pseudotime (DC1)",ylab = "Predicted Healthy",
     main = paste0("Healthy original vs fitted pseudotime\nMSE: ", round(cv_gam$cvscale, 6)),
     pch = 19, cex = 0.5)
abline(0,1, col = "blue", lwd = 3)
dev.off()
null device 
          1 

Endothelial cells

Subset and process each condition (only LSEC)

df_h = data.table("zonation_pt" = traj_list$healthy)
df_h$zonation_int = cut(df_h$zonation_pt, 10)

df_e = data.table("zonation_pt" = traj_list$embolised)
df_e = df_h[df_e, on="zonation_pt", roll=Inf, rollends = T]

df_r = data.table("zonation_pt" = traj_list$regenerating)
df_r = df_h[df_r, on="zonation_pt", roll=Inf, rollends = T]
Error in colnamesInt(i, unname(on), check_dups = FALSE) : 
  argument specifying columns specify non existing column(s): cols[1]='zonation_pt'

Healthy analysis

For healthy cells (which will serve as reference), do additional filtering and reprocessing

DimPlot(end_cells$healthy, reduction = "umap", group.by = "allcells_clusters")
gfresh = c("CLEC1B", "CLEC4G", "LYVE1", "CD14",  # central 
           "PECAM1", "AQP1", "VWF", "CD34", # portal
           "ALB", "MKI67", "TRAC", "DCN") 
FeaturePlot(end_cells$healthy, features = gfresh, reduction = "umap")

outcells = (end_cells$healthy@reductions$umap@cell.embeddings[,1]<(-4)) # this removes what seem to be central hepatocytes
end_cells$healthy = end_cells$healthy[,!outcells]

end_cells$healthy = suppressWarnings(SCTransform(end_cells$healthy, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
end_cells$healthy = RunPCA(end_cells$healthy, verbose = F)
end_cells$healthy = RunUMAP(end_cells$healthy, dims = 1:15, verbose = F)
DimPlot(end_cells$healthy, reduction = "umap", group.by = "unique_name")

end_cells$healthy = FindNeighbors(end_cells$healthy, reduction = "pca", dims = 1:15,
                                  force.recalc = T, graph.name = "end")
end_cells$healthy = FindClusters(end_cells$healthy, graph.name = "end", algorithm = 2, 
                                 resolution = seq(0.1,0.8,0.1), verbose = F)
DimPlot(end_cells$healthy, reduction = "umap", group.by = "end_res.0.2")
end_cells$healthy = SetIdent(end_cells$healthy, value = "end_res.0.2")
mk_end_h = FindAllMarkers(end_cells$healthy, assay = "SCT")

saveRDS(end_cells$healthy, file = "results/zonation_healthy/end_healthy_srat.RDS")

Save endothelial cells

saveRDS(end_cells, file="results/zonation_cond/end_cells.RDS")

Plot expression of zonation markers

gfresh = c("CLEC1B", "CLEC4G", "LYVE1", "CD14",  # central 
           "PECAM1", "AQP1", "VWF", "CD34") # portal

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_violin.png", 
    height = 1400, width = 1400)
VlnPlot(end_cells$healthy, features = gfresh, group.by = "names_clusters", 
        pt.size = 0, sort = "increasing")
dev.off()
png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_umap.png", 
    height = 1400, width = 1400)
FeaturePlot(end_cells$healthy, features = gfresh, pt.size = 0.6)
dev.off()

Obtain Diffusion maps projection

set.seed(1)
dm = DiffusionMap(end_cells$healthy@reductions$pca@cell.embeddings[,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm, tips = c(2382, 1379, 424))
plot(dpt, col_by = "DPT424")
plot(dpt, col_by = "branch")

dm = DiffusionMap(end_cells$healthy@reductions$pca@cell.embeddings[dpt$DPT424<15,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm)
plot(dpt, col_by = "DPT363")

end_cells$healthy = end_cells$healthy[,names(dpt@dm$DC1)]

Check expression of some genes

set.seed(1)
dm = DiffusionMap(end_cells$healthy@reductions$pca@cell.embeddings[,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm, tips = c(2382, 1379, 424))
plot(dpt, col_by = "DPT424")

plot(dpt, col_by = "branch")


dm = DiffusionMap(end_cells$healthy@reductions$pca@cell.embeddings[dpt$DPT424<15,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm)
plot(dpt, col_by = "DPT363")


end_cells$healthy = end_cells$healthy[,names(dpt@dm$DC1)]

We’ll be using DPT363 as the pseudotime. This was the best approximation possible to the zonation gradient based on diffusion maps

pdf("results/zonation_healthy/end_distributions_DPT363.pdf", useDingbats = F, 
    height = 4, width = 9)
plt1 = ggplot(df_dc, aes(x = DPT, y = names_clusters, 
                  group = names_clusters, fill = names_clusters))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

plt2 = ggplot(df_dc, aes(x = DPT, y = unique_name, 
                  group = unique_name, fill = unique_name))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

cowplot::plot_grid(plt1, plt2, ncol = 2, align = "h")
dev.off()

Find genes varying along the pseudotime

# will only use variable genes
hvg = end_cells$healthy@assays$SCT@var.features

# Fit GAM for each gene using pseudotime as independent variable.
t = scales::rescale(rank(-df_dc$DPT), c(0.00001, 0.99999))
gene_fit_p = c()
gene_fit_vals = data.frame(row.names = 1:100)
for(i in 1:length(hvg)){
  g = hvg[i]
  z = end_cells$healthy@assays$SCT@data[g,]
  
  d = data.frame(z=z, t=t)
  tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
  
  # bins for model fitting
  bb = seq(min(t), max(t), length.out = 100)
  gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                               newdata = data.frame(t = bb)))
  p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
  gene_fit_p = c(gene_fit_p, p)
  names(gene_fit_p)[length(gene_fit_p)] = g
}

gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                              verbose=F, cutoff.method="pct0", pct0=0.9)$qval

Analysis of other conditions

First, clean the data from the other two conditions

# will only use variable genes
hvg = end_cells$healthy@assays$SCT@var.features

# Fit GAM for each gene using pseudotime as independent variable.
t = scales::rescale(rank(-df_dc$DPT), c(0.00001, 0.99999))
gene_fit_p = c()
gene_fit_vals = data.frame(row.names = 1:100)
for(i in 1:length(hvg)){
  g = hvg[i]
  z = end_cells$healthy@assays$SCT@data[g,]
  
  d = data.frame(z=z, t=t)
  tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
  
  # bins for model fitting
  bb = seq(min(t), max(t), length.out = 100)
  gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                               newdata = data.frame(t = bb)))
  p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
  gene_fit_p = c(gene_fit_p, p)
  names(gene_fit_p)[length(gene_fit_p)] = g
}

gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                              verbose=F, cutoff.method="pct0", pct0=0.9)$qval

Now use the top genes to learn the pseudozonation coordinates and project the embolised and regenerating data

end_cells$embolised = FindNeighbors(end_cells$embolised, reduction = "pca", dims = 1:20,
                                    force.recalc = T, graph.name = "endo")
Computing nearest neighbor graph
Computing SNN
end_cells$embolised = FindClusters(end_cells$embolised, graph.name = "endo", algorithm = 2, 
                                   resolution = 0.8, verbose = F)
DimPlot(end_cells$embolised, reduction = "umap", group.by = "endo_res.0.8")

end_cells$embolised = SetIdent(end_cells$embolised, value = "endo_res.0.8")
mk_emb = FindAllMarkers(end_cells$embolised, assay = "SCT")
Calculating cluster 0

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~02s          
  |++                                                | 3 % ~02s          
  |+++                                               | 4 % ~03s          
  |+++                                               | 6 % ~02s          
  |++++                                              | 7 % ~02s          
  |+++++                                             | 9 % ~02s          
  |++++++                                            | 10% ~02s          
  |++++++                                            | 12% ~02s          
  |+++++++                                           | 13% ~02s          
  |++++++++                                          | 14% ~02s          
  |++++++++                                          | 16% ~02s          
  |+++++++++                                         | 17% ~02s          
  |++++++++++                                        | 19% ~02s          
  |+++++++++++                                       | 20% ~02s          
  |+++++++++++                                       | 22% ~02s          
  |++++++++++++                                      | 23% ~02s          
  |+++++++++++++                                     | 25% ~02s          
  |++++++++++++++                                    | 26% ~02s          
  |++++++++++++++                                    | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |++++++++++++++++                                  | 30% ~02s          
  |++++++++++++++++                                  | 32% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |+++++++++++++++++++                               | 36% ~02s          
  |+++++++++++++++++++                               | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 42% ~02s          
  |++++++++++++++++++++++                            | 43% ~02s          
  |+++++++++++++++++++++++                           | 45% ~02s          
  |++++++++++++++++++++++++                          | 46% ~02s          
  |++++++++++++++++++++++++                          | 48% ~01s          
  |+++++++++++++++++++++++++                         | 49% ~01s          
  |++++++++++++++++++++++++++                        | 51% ~01s          
  |+++++++++++++++++++++++++++                       | 52% ~01s          
  |+++++++++++++++++++++++++++                       | 54% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |+++++++++++++++++++++++++++++                     | 58% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |++++++++++++++++++++++++++++++++                  | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 84% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 1

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~04s          
  |++                                                | 3 % ~04s          
  |++                                                | 4 % ~04s          
  |+++                                               | 5 % ~04s          
  |++++                                              | 7 % ~04s          
  |++++                                              | 8 % ~04s          
  |+++++                                             | 9 % ~04s          
  |++++++                                            | 11% ~04s          
  |++++++                                            | 12% ~04s          
  |+++++++                                           | 13% ~04s          
  |++++++++                                          | 14% ~04s          
  |++++++++                                          | 16% ~03s          
  |+++++++++                                         | 17% ~03s          
  |++++++++++                                        | 18% ~03s          
  |++++++++++                                        | 20% ~03s          
  |+++++++++++                                       | 21% ~03s          
  |++++++++++++                                      | 22% ~03s          
  |++++++++++++                                      | 24% ~03s          
  |+++++++++++++                                     | 25% ~03s          
  |++++++++++++++                                    | 26% ~03s          
  |++++++++++++++                                    | 28% ~03s          
  |+++++++++++++++                                   | 29% ~03s          
  |++++++++++++++++                                  | 30% ~03s          
  |++++++++++++++++                                  | 32% ~03s          
  |+++++++++++++++++                                 | 33% ~03s          
  |++++++++++++++++++                                | 34% ~03s          
  |++++++++++++++++++                                | 36% ~03s          
  |+++++++++++++++++++                               | 37% ~03s          
  |++++++++++++++++++++                              | 38% ~03s          
  |++++++++++++++++++++                              | 39% ~03s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 42% ~02s          
  |++++++++++++++++++++++                            | 43% ~02s          
  |+++++++++++++++++++++++                           | 45% ~02s          
  |++++++++++++++++++++++++                          | 46% ~02s          
  |++++++++++++++++++++++++                          | 47% ~02s          
  |+++++++++++++++++++++++++                         | 49% ~02s          
  |+++++++++++++++++++++++++                         | 50% ~02s          
  |++++++++++++++++++++++++++                        | 51% ~02s          
  |+++++++++++++++++++++++++++                       | 53% ~02s          
  |+++++++++++++++++++++++++++                       | 54% ~02s          
  |++++++++++++++++++++++++++++                      | 55% ~02s          
  |+++++++++++++++++++++++++++++                     | 57% ~02s          
  |+++++++++++++++++++++++++++++                     | 58% ~02s          
  |++++++++++++++++++++++++++++++                    | 59% ~02s          
  |+++++++++++++++++++++++++++++++                   | 61% ~02s          
  |+++++++++++++++++++++++++++++++                   | 62% ~02s          
  |++++++++++++++++++++++++++++++++                  | 63% ~02s          
  |+++++++++++++++++++++++++++++++++                 | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=04s  
Calculating cluster 2

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~02s          
  |++                                                | 2 % ~02s          
  |++                                                | 3 % ~02s          
  |+++                                               | 4 % ~02s          
  |+++                                               | 5 % ~02s          
  |++++                                              | 6 % ~02s          
  |++++                                              | 7 % ~02s          
  |+++++                                             | 9 % ~02s          
  |+++++                                             | 10% ~02s          
  |++++++                                            | 11% ~02s          
  |++++++                                            | 12% ~02s          
  |+++++++                                           | 13% ~02s          
  |+++++++                                           | 14% ~02s          
  |++++++++                                          | 15% ~02s          
  |++++++++                                          | 16% ~02s          
  |+++++++++                                         | 17% ~02s          
  |++++++++++                                        | 18% ~02s          
  |++++++++++                                        | 19% ~02s          
  |+++++++++++                                       | 20% ~02s          
  |+++++++++++                                       | 21% ~02s          
  |++++++++++++                                      | 22% ~02s          
  |++++++++++++                                      | 23% ~02s          
  |+++++++++++++                                     | 24% ~02s          
  |+++++++++++++                                     | 26% ~02s          
  |++++++++++++++                                    | 27% ~02s          
  |++++++++++++++                                    | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |+++++++++++++++                                   | 30% ~02s          
  |++++++++++++++++                                  | 31% ~02s          
  |++++++++++++++++                                  | 32% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |++++++++++++++++++                                | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |+++++++++++++++++++                               | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~01s          
  |++++++++++++++++++++                              | 38% ~01s          
  |++++++++++++++++++++                              | 39% ~01s          
  |+++++++++++++++++++++                             | 40% ~01s          
  |+++++++++++++++++++++                             | 41% ~01s          
  |++++++++++++++++++++++                            | 43% ~01s          
  |++++++++++++++++++++++                            | 44% ~01s          
  |+++++++++++++++++++++++                           | 45% ~01s          
  |+++++++++++++++++++++++                           | 46% ~01s          
  |++++++++++++++++++++++++                          | 47% ~01s          
  |++++++++++++++++++++++++                          | 48% ~01s          
  |+++++++++++++++++++++++++                         | 49% ~01s          
  |+++++++++++++++++++++++++                         | 50% ~01s          
  |++++++++++++++++++++++++++                        | 51% ~01s          
  |+++++++++++++++++++++++++++                       | 52% ~01s          
  |+++++++++++++++++++++++++++                       | 53% ~01s          
  |++++++++++++++++++++++++++++                      | 54% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |+++++++++++++++++++++++++++++                     | 56% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |+++++++++++++++++++++++++++++++                   | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |++++++++++++++++++++++++++++++++                  | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~00s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~00s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 84% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 86% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=02s  
Calculating cluster 3

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~03s          
  |++                                                | 2 % ~03s          
  |++                                                | 3 % ~03s          
  |+++                                               | 4 % ~03s          
  |+++                                               | 5 % ~03s          
  |++++                                              | 7 % ~03s          
  |++++                                              | 8 % ~03s          
  |+++++                                             | 9 % ~02s          
  |+++++                                             | 10% ~02s          
  |++++++                                            | 11% ~02s          
  |++++++                                            | 12% ~02s          
  |+++++++                                           | 13% ~02s          
  |++++++++                                          | 14% ~02s          
  |++++++++                                          | 15% ~02s          
  |+++++++++                                         | 16% ~02s          
  |+++++++++                                         | 17% ~02s          
  |++++++++++                                        | 18% ~02s          
  |++++++++++                                        | 20% ~02s          
  |+++++++++++                                       | 21% ~02s          
  |+++++++++++                                       | 22% ~02s          
  |++++++++++++                                      | 23% ~02s          
  |++++++++++++                                      | 24% ~02s          
  |+++++++++++++                                     | 25% ~02s          
  |++++++++++++++                                    | 26% ~02s          
  |++++++++++++++                                    | 27% ~02s          
  |+++++++++++++++                                   | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |++++++++++++++++                                  | 30% ~02s          
  |++++++++++++++++                                  | 32% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |+++++++++++++++++                                 | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |++++++++++++++++++                                | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~02s          
  |++++++++++++++++++++                              | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |+++++++++++++++++++++                             | 40% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 42% ~01s          
  |++++++++++++++++++++++                            | 43% ~01s          
  |+++++++++++++++++++++++                           | 45% ~01s          
  |+++++++++++++++++++++++                           | 46% ~01s          
  |++++++++++++++++++++++++                          | 47% ~01s          
  |++++++++++++++++++++++++                          | 48% ~01s          
  |+++++++++++++++++++++++++                         | 49% ~01s          
  |+++++++++++++++++++++++++                         | 50% ~01s          
  |++++++++++++++++++++++++++                        | 51% ~01s          
  |+++++++++++++++++++++++++++                       | 52% ~01s          
  |+++++++++++++++++++++++++++                       | 53% ~01s          
  |++++++++++++++++++++++++++++                      | 54% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |+++++++++++++++++++++++++++++                     | 58% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |+++++++++++++++++++++++++++++++                   | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 4

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~04s          
  |++                                                | 2 % ~03s          
  |++                                                | 3 % ~03s          
  |+++                                               | 4 % ~03s          
  |+++                                               | 5 % ~03s          
  |++++                                              | 7 % ~03s          
  |++++                                              | 8 % ~03s          
  |+++++                                             | 9 % ~03s          
  |+++++                                             | 10% ~03s          
  |++++++                                            | 11% ~03s          
  |++++++                                            | 12% ~03s          
  |+++++++                                           | 13% ~03s          
  |++++++++                                          | 14% ~03s          
  |++++++++                                          | 15% ~03s          
  |+++++++++                                         | 16% ~03s          
  |+++++++++                                         | 17% ~03s          
  |++++++++++                                        | 18% ~03s          
  |++++++++++                                        | 20% ~03s          
  |+++++++++++                                       | 21% ~03s          
  |+++++++++++                                       | 22% ~03s          
  |++++++++++++                                      | 23% ~03s          
  |++++++++++++                                      | 24% ~03s          
  |+++++++++++++                                     | 25% ~02s          
  |++++++++++++++                                    | 26% ~02s          
  |++++++++++++++                                    | 27% ~02s          
  |+++++++++++++++                                   | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |++++++++++++++++                                  | 30% ~02s          
  |++++++++++++++++                                  | 32% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |+++++++++++++++++                                 | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |++++++++++++++++++                                | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~02s          
  |++++++++++++++++++++                              | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |+++++++++++++++++++++                             | 40% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 42% ~02s          
  |++++++++++++++++++++++                            | 43% ~02s          
  |+++++++++++++++++++++++                           | 45% ~02s          
  |+++++++++++++++++++++++                           | 46% ~02s          
  |++++++++++++++++++++++++                          | 47% ~02s          
  |++++++++++++++++++++++++                          | 48% ~02s          
  |+++++++++++++++++++++++++                         | 49% ~02s          
  |+++++++++++++++++++++++++                         | 50% ~02s          
  |++++++++++++++++++++++++++                        | 51% ~02s          
  |+++++++++++++++++++++++++++                       | 52% ~02s          
  |+++++++++++++++++++++++++++                       | 53% ~02s          
  |++++++++++++++++++++++++++++                      | 54% ~02s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |+++++++++++++++++++++++++++++                     | 58% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |+++++++++++++++++++++++++++++++                   | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 5

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~03s          
  |++                                                | 2 % ~03s          
  |++                                                | 3 % ~03s          
  |+++                                               | 4 % ~03s          
  |+++                                               | 5 % ~03s          
  |++++                                              | 6 % ~03s          
  |++++                                              | 7 % ~03s          
  |+++++                                             | 8 % ~02s          
  |+++++                                             | 9 % ~02s          
  |++++++                                            | 10% ~02s          
  |++++++                                            | 11% ~02s          
  |+++++++                                           | 12% ~02s          
  |+++++++                                           | 13% ~02s          
  |++++++++                                          | 14% ~02s          
  |++++++++                                          | 15% ~02s          
  |+++++++++                                         | 16% ~02s          
  |+++++++++                                         | 18% ~02s          
  |++++++++++                                        | 19% ~02s          
  |++++++++++                                        | 20% ~02s          
  |+++++++++++                                       | 21% ~02s          
  |+++++++++++                                       | 22% ~02s          
  |++++++++++++                                      | 23% ~02s          
  |++++++++++++                                      | 24% ~02s          
  |+++++++++++++                                     | 25% ~02s          
  |+++++++++++++                                     | 26% ~02s          
  |++++++++++++++                                    | 27% ~02s          
  |++++++++++++++                                    | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |+++++++++++++++                                   | 30% ~02s          
  |++++++++++++++++                                  | 31% ~02s          
  |++++++++++++++++                                  | 32% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |++++++++++++++++++                                | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |+++++++++++++++++++                               | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~02s          
  |++++++++++++++++++++                              | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |+++++++++++++++++++++                             | 40% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 42% ~02s          
  |++++++++++++++++++++++                            | 43% ~01s          
  |+++++++++++++++++++++++                           | 44% ~01s          
  |+++++++++++++++++++++++                           | 45% ~01s          
  |++++++++++++++++++++++++                          | 46% ~01s          
  |++++++++++++++++++++++++                          | 47% ~01s          
  |+++++++++++++++++++++++++                         | 48% ~01s          
  |+++++++++++++++++++++++++                         | 49% ~01s          
  |++++++++++++++++++++++++++                        | 51% ~01s          
  |++++++++++++++++++++++++++                        | 52% ~01s          
  |+++++++++++++++++++++++++++                       | 53% ~01s          
  |+++++++++++++++++++++++++++                       | 54% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |++++++++++++++++++++++++++++                      | 56% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |+++++++++++++++++++++++++++++                     | 58% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |+++++++++++++++++++++++++++++++                   | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |++++++++++++++++++++++++++++++++                  | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 82% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 6

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~03s          
  |++                                                | 3 % ~03s          
  |++                                                | 4 % ~03s          
  |+++                                               | 5 % ~03s          
  |++++                                              | 7 % ~03s          
  |++++                                              | 8 % ~02s          
  |+++++                                             | 9 % ~02s          
  |++++++                                            | 11% ~02s          
  |++++++                                            | 12% ~02s          
  |+++++++                                           | 13% ~02s          
  |++++++++                                          | 15% ~02s          
  |++++++++                                          | 16% ~02s          
  |+++++++++                                         | 17% ~02s          
  |++++++++++                                        | 19% ~02s          
  |++++++++++                                        | 20% ~02s          
  |+++++++++++                                       | 21% ~02s          
  |++++++++++++                                      | 23% ~02s          
  |++++++++++++                                      | 24% ~02s          
  |+++++++++++++                                     | 25% ~02s          
  |++++++++++++++                                    | 27% ~02s          
  |++++++++++++++                                    | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |++++++++++++++++                                  | 31% ~02s          
  |++++++++++++++++                                  | 32% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |++++++++++++++++++                                | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |++++++++++++++++++++                              | 40% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 43% ~02s          
  |++++++++++++++++++++++                            | 44% ~02s          
  |+++++++++++++++++++++++                           | 45% ~02s          
  |++++++++++++++++++++++++                          | 47% ~01s          
  |++++++++++++++++++++++++                          | 48% ~01s          
  |+++++++++++++++++++++++++                         | 49% ~01s          
  |++++++++++++++++++++++++++                        | 51% ~01s          
  |++++++++++++++++++++++++++                        | 52% ~01s          
  |+++++++++++++++++++++++++++                       | 53% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |++++++++++++++++++++++++++++                      | 56% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |++++++++++++++++++++++++++++++++                  | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |++++++++++++++++++++++++++++++++++                | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 7

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~06s          
  |++                                                | 2 % ~06s          
  |++                                                | 3 % ~06s          
  |+++                                               | 4 % ~06s          
  |+++                                               | 5 % ~06s          
  |++++                                              | 6 % ~05s          
  |++++                                              | 7 % ~05s          
  |+++++                                             | 8 % ~05s          
  |+++++                                             | 9 % ~05s          
  |++++++                                            | 10% ~05s          
  |++++++                                            | 11% ~05s          
  |+++++++                                           | 12% ~05s          
  |+++++++                                           | 13% ~05s          
  |++++++++                                          | 14% ~05s          
  |++++++++                                          | 15% ~05s          
  |+++++++++                                         | 16% ~05s          
  |+++++++++                                         | 18% ~05s          
  |++++++++++                                        | 19% ~05s          
  |++++++++++                                        | 20% ~05s          
  |+++++++++++                                       | 21% ~05s          
  |+++++++++++                                       | 22% ~05s          
  |++++++++++++                                      | 23% ~05s          
  |++++++++++++                                      | 24% ~05s          
  |+++++++++++++                                     | 25% ~04s          
  |+++++++++++++                                     | 26% ~04s          
  |++++++++++++++                                    | 27% ~04s          
  |++++++++++++++                                    | 28% ~04s          
  |+++++++++++++++                                   | 29% ~04s          
  |+++++++++++++++                                   | 30% ~04s          
  |++++++++++++++++                                  | 31% ~04s          
  |++++++++++++++++                                  | 32% ~04s          
  |+++++++++++++++++                                 | 33% ~04s          
  |++++++++++++++++++                                | 34% ~04s          
  |++++++++++++++++++                                | 35% ~04s          
  |+++++++++++++++++++                               | 36% ~04s          
  |+++++++++++++++++++                               | 37% ~04s          
  |++++++++++++++++++++                              | 38% ~04s          
  |++++++++++++++++++++                              | 39% ~04s          
  |+++++++++++++++++++++                             | 40% ~04s          
  |+++++++++++++++++++++                             | 41% ~03s          
  |++++++++++++++++++++++                            | 42% ~03s          
  |++++++++++++++++++++++                            | 43% ~03s          
  |+++++++++++++++++++++++                           | 44% ~03s          
  |+++++++++++++++++++++++                           | 45% ~03s          
  |++++++++++++++++++++++++                          | 46% ~03s          
  |++++++++++++++++++++++++                          | 47% ~03s          
  |+++++++++++++++++++++++++                         | 48% ~03s          
  |+++++++++++++++++++++++++                         | 49% ~03s          
  |++++++++++++++++++++++++++                        | 51% ~03s          
  |++++++++++++++++++++++++++                        | 52% ~03s          
  |+++++++++++++++++++++++++++                       | 53% ~03s          
  |+++++++++++++++++++++++++++                       | 54% ~03s          
  |++++++++++++++++++++++++++++                      | 55% ~03s          
  |++++++++++++++++++++++++++++                      | 56% ~03s          
  |+++++++++++++++++++++++++++++                     | 57% ~03s          
  |+++++++++++++++++++++++++++++                     | 58% ~03s          
  |++++++++++++++++++++++++++++++                    | 59% ~02s          
  |++++++++++++++++++++++++++++++                    | 60% ~02s          
  |+++++++++++++++++++++++++++++++                   | 61% ~02s          
  |+++++++++++++++++++++++++++++++                   | 62% ~02s          
  |++++++++++++++++++++++++++++++++                  | 63% ~02s          
  |++++++++++++++++++++++++++++++++                  | 64% ~02s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~02s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~02s          
  |++++++++++++++++++++++++++++++++++                | 67% ~02s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~02s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~02s          
  |++++++++++++++++++++++++++++++++++++              | 70% ~02s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~02s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~02s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~02s          
  |++++++++++++++++++++++++++++++++++++++            | 74% ~02s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=06s  
Calculating cluster 8

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~04s          
  |++                                                | 2 % ~04s          
  |++                                                | 3 % ~04s          
  |+++                                               | 4 % ~04s          
  |+++                                               | 5 % ~04s          
  |++++                                              | 6 % ~03s          
  |++++                                              | 7 % ~03s          
  |+++++                                             | 8 % ~03s          
  |+++++                                             | 9 % ~03s          
  |++++++                                            | 10% ~03s          
  |++++++                                            | 11% ~03s          
  |+++++++                                           | 12% ~03s          
  |+++++++                                           | 13% ~03s          
  |++++++++                                          | 14% ~03s          
  |++++++++                                          | 15% ~03s          
  |+++++++++                                         | 16% ~03s          
  |+++++++++                                         | 17% ~03s          
  |++++++++++                                        | 18% ~03s          
  |++++++++++                                        | 19% ~03s          
  |+++++++++++                                       | 20% ~03s          
  |+++++++++++                                       | 21% ~03s          
  |++++++++++++                                      | 22% ~03s          
  |++++++++++++                                      | 23% ~03s          
  |+++++++++++++                                     | 24% ~03s          
  |+++++++++++++                                     | 26% ~03s          
  |++++++++++++++                                    | 27% ~03s          
  |++++++++++++++                                    | 28% ~03s          
  |+++++++++++++++                                   | 29% ~03s          
  |+++++++++++++++                                   | 30% ~03s          
  |++++++++++++++++                                  | 31% ~03s          
  |++++++++++++++++                                  | 32% ~03s          
  |+++++++++++++++++                                 | 33% ~03s          
  |+++++++++++++++++                                 | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |++++++++++++++++++                                | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~02s          
  |+++++++++++++++++++                               | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |++++++++++++++++++++                              | 40% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |+++++++++++++++++++++                             | 42% ~02s          
  |++++++++++++++++++++++                            | 43% ~02s          
  |++++++++++++++++++++++                            | 44% ~02s          
  |+++++++++++++++++++++++                           | 45% ~02s          
  |+++++++++++++++++++++++                           | 46% ~02s          
  |++++++++++++++++++++++++                          | 47% ~02s          
  |++++++++++++++++++++++++                          | 48% ~02s          
  |+++++++++++++++++++++++++                         | 49% ~02s          
  |+++++++++++++++++++++++++                         | 50% ~02s          
  |++++++++++++++++++++++++++                        | 51% ~02s          
  |+++++++++++++++++++++++++++                       | 52% ~02s          
  |+++++++++++++++++++++++++++                       | 53% ~02s          
  |++++++++++++++++++++++++++++                      | 54% ~02s          
  |++++++++++++++++++++++++++++                      | 55% ~02s          
  |+++++++++++++++++++++++++++++                     | 56% ~02s          
  |+++++++++++++++++++++++++++++                     | 57% ~02s          
  |++++++++++++++++++++++++++++++                    | 58% ~02s          
  |++++++++++++++++++++++++++++++                    | 59% ~02s          
  |+++++++++++++++++++++++++++++++                   | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |++++++++++++++++++++++++++++++++                  | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=04s  
Calculating cluster 9

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~03s          
  |++                                                | 2 % ~03s          
  |++                                                | 3 % ~03s          
  |+++                                               | 4 % ~03s          
  |+++                                               | 6 % ~03s          
  |++++                                              | 7 % ~03s          
  |++++                                              | 8 % ~03s          
  |+++++                                             | 9 % ~03s          
  |++++++                                            | 10% ~03s          
  |++++++                                            | 11% ~03s          
  |+++++++                                           | 12% ~03s          
  |+++++++                                           | 13% ~03s          
  |++++++++                                          | 15% ~03s          
  |++++++++                                          | 16% ~03s          
  |+++++++++                                         | 17% ~03s          
  |+++++++++                                         | 18% ~03s          
  |++++++++++                                        | 19% ~03s          
  |+++++++++++                                       | 20% ~03s          
  |+++++++++++                                       | 21% ~02s          
  |++++++++++++                                      | 22% ~02s          
  |++++++++++++                                      | 24% ~02s          
  |+++++++++++++                                     | 25% ~02s          
  |+++++++++++++                                     | 26% ~02s          
  |++++++++++++++                                    | 27% ~02s          
  |+++++++++++++++                                   | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |++++++++++++++++                                  | 30% ~02s          
  |++++++++++++++++                                  | 31% ~02s          
  |+++++++++++++++++                                 | 33% ~02s          
  |+++++++++++++++++                                 | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |++++++++++++++++++                                | 36% ~02s          
  |+++++++++++++++++++                               | 37% ~02s          
  |++++++++++++++++++++                              | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |+++++++++++++++++++++                             | 40% ~02s          
  |+++++++++++++++++++++                             | 42% ~02s          
  |++++++++++++++++++++++                            | 43% ~02s          
  |++++++++++++++++++++++                            | 44% ~02s          
  |+++++++++++++++++++++++                           | 45% ~02s          
  |++++++++++++++++++++++++                          | 46% ~02s          
  |++++++++++++++++++++++++                          | 47% ~02s          
  |+++++++++++++++++++++++++                         | 48% ~02s          
  |+++++++++++++++++++++++++                         | 49% ~02s          
  |++++++++++++++++++++++++++                        | 51% ~02s          
  |++++++++++++++++++++++++++                        | 52% ~02s          
  |+++++++++++++++++++++++++++                       | 53% ~02s          
  |+++++++++++++++++++++++++++                       | 54% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |+++++++++++++++++++++++++++++                     | 56% ~01s          
  |+++++++++++++++++++++++++++++                     | 57% ~01s          
  |++++++++++++++++++++++++++++++                    | 58% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |+++++++++++++++++++++++++++++++                   | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 63% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 67% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 10

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~03s          
  |++                                                | 2 % ~03s          
  |++                                                | 4 % ~03s          
  |+++                                               | 5 % ~03s          
  |++++                                              | 6 % ~03s          
  |++++                                              | 8 % ~03s          
  |+++++                                             | 9 % ~03s          
  |+++++                                             | 10% ~02s          
  |++++++                                            | 11% ~02s          
  |+++++++                                           | 12% ~02s          
  |+++++++                                           | 14% ~02s          
  |++++++++                                          | 15% ~02s          
  |+++++++++                                         | 16% ~02s          
  |+++++++++                                         | 18% ~02s          
  |++++++++++                                        | 19% ~02s          
  |++++++++++                                        | 20% ~02s          
  |+++++++++++                                       | 21% ~02s          
  |++++++++++++                                      | 22% ~02s          
  |++++++++++++                                      | 24% ~02s          
  |+++++++++++++                                     | 25% ~02s          
  |++++++++++++++                                    | 26% ~02s          
  |++++++++++++++                                    | 28% ~02s          
  |+++++++++++++++                                   | 29% ~02s          
  |+++++++++++++++                                   | 30% ~02s          
  |++++++++++++++++                                  | 31% ~02s          
  |+++++++++++++++++                                 | 32% ~02s          
  |+++++++++++++++++                                 | 34% ~02s          
  |++++++++++++++++++                                | 35% ~02s          
  |+++++++++++++++++++                               | 36% ~02s          
  |+++++++++++++++++++                               | 38% ~02s          
  |++++++++++++++++++++                              | 39% ~02s          
  |++++++++++++++++++++                              | 40% ~02s          
  |+++++++++++++++++++++                             | 41% ~02s          
  |++++++++++++++++++++++                            | 42% ~02s          
  |++++++++++++++++++++++                            | 44% ~02s          
  |+++++++++++++++++++++++                           | 45% ~01s          
  |++++++++++++++++++++++++                          | 46% ~01s          
  |++++++++++++++++++++++++                          | 48% ~01s          
  |+++++++++++++++++++++++++                         | 49% ~01s          
  |+++++++++++++++++++++++++                         | 50% ~01s          
  |++++++++++++++++++++++++++                        | 51% ~01s          
  |+++++++++++++++++++++++++++                       | 52% ~01s          
  |+++++++++++++++++++++++++++                       | 54% ~01s          
  |++++++++++++++++++++++++++++                      | 55% ~01s          
  |+++++++++++++++++++++++++++++                     | 56% ~01s          
  |+++++++++++++++++++++++++++++                     | 58% ~01s          
  |++++++++++++++++++++++++++++++                    | 59% ~01s          
  |++++++++++++++++++++++++++++++                    | 60% ~01s          
  |+++++++++++++++++++++++++++++++                   | 61% ~01s          
  |++++++++++++++++++++++++++++++++                  | 62% ~01s          
  |++++++++++++++++++++++++++++++++                  | 64% ~01s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~01s          
  |++++++++++++++++++++++++++++++++++                | 66% ~01s          
  |++++++++++++++++++++++++++++++++++                | 68% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~01s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~01s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 72% ~01s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~01s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 82% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 86% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 92% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03s  
Calculating cluster 11

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~07s          
  |++                                                | 2 % ~07s          
  |++                                                | 3 % ~06s          
  |+++                                               | 4 % ~06s          
  |+++                                               | 5 % ~06s          
  |++++                                              | 6 % ~06s          
  |++++                                              | 7 % ~06s          
  |+++++                                             | 8 % ~06s          
  |+++++                                             | 9 % ~05s          
  |++++++                                            | 10% ~05s          
  |++++++                                            | 11% ~05s          
  |+++++++                                           | 12% ~05s          
  |+++++++                                           | 14% ~05s          
  |++++++++                                          | 15% ~05s          
  |++++++++                                          | 16% ~05s          
  |+++++++++                                         | 17% ~05s          
  |+++++++++                                         | 18% ~05s          
  |++++++++++                                        | 19% ~05s          
  |++++++++++                                        | 20% ~05s          
  |+++++++++++                                       | 21% ~05s          
  |+++++++++++                                       | 22% ~05s          
  |++++++++++++                                      | 23% ~05s          
  |++++++++++++                                      | 24% ~05s          
  |+++++++++++++                                     | 25% ~05s          
  |++++++++++++++                                    | 26% ~05s          
  |++++++++++++++                                    | 27% ~05s          
  |+++++++++++++++                                   | 28% ~05s          
  |+++++++++++++++                                   | 29% ~05s          
  |++++++++++++++++                                  | 30% ~04s          
  |++++++++++++++++                                  | 31% ~04s          
  |+++++++++++++++++                                 | 32% ~04s          
  |+++++++++++++++++                                 | 33% ~04s          
  |++++++++++++++++++                                | 34% ~04s          
  |++++++++++++++++++                                | 35% ~04s          
  |+++++++++++++++++++                               | 36% ~04s          
  |+++++++++++++++++++                               | 38% ~04s          
  |++++++++++++++++++++                              | 39% ~04s          
  |++++++++++++++++++++                              | 40% ~04s          
  |+++++++++++++++++++++                             | 41% ~04s          
  |+++++++++++++++++++++                             | 42% ~04s          
  |++++++++++++++++++++++                            | 43% ~03s          
  |++++++++++++++++++++++                            | 44% ~03s          
  |+++++++++++++++++++++++                           | 45% ~03s          
  |+++++++++++++++++++++++                           | 46% ~03s          
  |++++++++++++++++++++++++                          | 47% ~03s          
  |++++++++++++++++++++++++                          | 48% ~03s          
  |+++++++++++++++++++++++++                         | 49% ~03s          
  |+++++++++++++++++++++++++                         | 50% ~03s          
  |++++++++++++++++++++++++++                        | 51% ~03s          
  |+++++++++++++++++++++++++++                       | 52% ~03s          
  |+++++++++++++++++++++++++++                       | 53% ~03s          
  |++++++++++++++++++++++++++++                      | 54% ~03s          
  |++++++++++++++++++++++++++++                      | 55% ~03s          
  |+++++++++++++++++++++++++++++                     | 56% ~03s          
  |+++++++++++++++++++++++++++++                     | 57% ~03s          
  |++++++++++++++++++++++++++++++                    | 58% ~03s          
  |++++++++++++++++++++++++++++++                    | 59% ~02s          
  |+++++++++++++++++++++++++++++++                   | 60% ~02s          
  |+++++++++++++++++++++++++++++++                   | 61% ~02s          
  |++++++++++++++++++++++++++++++++                  | 62% ~02s          
  |++++++++++++++++++++++++++++++++                  | 64% ~02s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~02s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~02s          
  |++++++++++++++++++++++++++++++++++                | 67% ~02s          
  |++++++++++++++++++++++++++++++++++                | 68% ~02s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~02s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~02s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~02s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~02s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~02s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~02s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~02s          
  |+++++++++++++++++++++++++++++++++++++++           | 76% ~01s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 78% ~01s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 86% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=06s  
Calculating cluster 12

  |                                                  | 0 % ~calculating  
  |+                                                 | 1 % ~07s          
  |++                                                | 2 % ~07s          
  |++                                                | 3 % ~07s          
  |+++                                               | 4 % ~07s          
  |+++                                               | 5 % ~07s          
  |++++                                              | 6 % ~07s          
  |++++                                              | 7 % ~07s          
  |+++++                                             | 8 % ~07s          
  |+++++                                             | 9 % ~07s          
  |++++++                                            | 10% ~07s          
  |++++++                                            | 11% ~07s          
  |+++++++                                           | 12% ~06s          
  |+++++++                                           | 13% ~06s          
  |++++++++                                          | 14% ~06s          
  |++++++++                                          | 15% ~06s          
  |+++++++++                                         | 16% ~06s          
  |+++++++++                                         | 17% ~06s          
  |++++++++++                                        | 18% ~06s          
  |++++++++++                                        | 19% ~06s          
  |+++++++++++                                       | 20% ~06s          
  |+++++++++++                                       | 21% ~06s          
  |++++++++++++                                      | 22% ~06s          
  |++++++++++++                                      | 23% ~05s          
  |+++++++++++++                                     | 24% ~05s          
  |+++++++++++++                                     | 25% ~05s          
  |++++++++++++++                                    | 26% ~05s          
  |++++++++++++++                                    | 27% ~05s          
  |+++++++++++++++                                   | 28% ~05s          
  |+++++++++++++++                                   | 29% ~05s          
  |++++++++++++++++                                  | 30% ~05s          
  |++++++++++++++++                                  | 31% ~05s          
  |+++++++++++++++++                                 | 32% ~05s          
  |+++++++++++++++++                                 | 33% ~05s          
  |++++++++++++++++++                                | 34% ~05s          
  |++++++++++++++++++                                | 35% ~05s          
  |+++++++++++++++++++                               | 36% ~05s          
  |+++++++++++++++++++                               | 37% ~04s          
  |++++++++++++++++++++                              | 38% ~04s          
  |++++++++++++++++++++                              | 39% ~04s          
  |+++++++++++++++++++++                             | 40% ~04s          
  |+++++++++++++++++++++                             | 41% ~04s          
  |++++++++++++++++++++++                            | 42% ~04s          
  |++++++++++++++++++++++                            | 43% ~04s          
  |+++++++++++++++++++++++                           | 44% ~04s          
  |+++++++++++++++++++++++                           | 45% ~04s          
  |++++++++++++++++++++++++                          | 46% ~04s          
  |++++++++++++++++++++++++                          | 47% ~04s          
  |+++++++++++++++++++++++++                         | 48% ~04s          
  |+++++++++++++++++++++++++                         | 49% ~04s          
  |++++++++++++++++++++++++++                        | 51% ~04s          
  |++++++++++++++++++++++++++                        | 52% ~03s          
  |+++++++++++++++++++++++++++                       | 53% ~03s          
  |+++++++++++++++++++++++++++                       | 54% ~03s          
  |++++++++++++++++++++++++++++                      | 55% ~03s          
  |++++++++++++++++++++++++++++                      | 56% ~03s          
  |+++++++++++++++++++++++++++++                     | 57% ~03s          
  |+++++++++++++++++++++++++++++                     | 58% ~03s          
  |++++++++++++++++++++++++++++++                    | 59% ~03s          
  |++++++++++++++++++++++++++++++                    | 60% ~03s          
  |+++++++++++++++++++++++++++++++                   | 61% ~03s          
  |+++++++++++++++++++++++++++++++                   | 62% ~03s          
  |++++++++++++++++++++++++++++++++                  | 63% ~03s          
  |++++++++++++++++++++++++++++++++                  | 64% ~03s          
  |+++++++++++++++++++++++++++++++++                 | 65% ~03s          
  |+++++++++++++++++++++++++++++++++                 | 66% ~02s          
  |++++++++++++++++++++++++++++++++++                | 67% ~02s          
  |++++++++++++++++++++++++++++++++++                | 68% ~02s          
  |+++++++++++++++++++++++++++++++++++               | 69% ~02s          
  |+++++++++++++++++++++++++++++++++++               | 70% ~02s          
  |++++++++++++++++++++++++++++++++++++              | 71% ~02s          
  |++++++++++++++++++++++++++++++++++++              | 72% ~02s          
  |+++++++++++++++++++++++++++++++++++++             | 73% ~02s          
  |+++++++++++++++++++++++++++++++++++++             | 74% ~02s          
  |++++++++++++++++++++++++++++++++++++++            | 75% ~02s          
  |++++++++++++++++++++++++++++++++++++++            | 76% ~02s          
  |+++++++++++++++++++++++++++++++++++++++           | 77% ~02s          
  |+++++++++++++++++++++++++++++++++++++++           | 78% ~02s          
  |++++++++++++++++++++++++++++++++++++++++          | 79% ~02s          
  |++++++++++++++++++++++++++++++++++++++++          | 80% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 81% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++         | 82% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++        | 84% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~01s          
  |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s          
  |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=07s  
outcells = (end_cells$embolised@reductions$umap@cell.embeddings[,2]>2.3 & 
              end_cells$embolised@reductions$umap@cell.embeddings[,1]<1) # this removes what seem to be central hepatocytes
end_cells$embolised = end_cells$embolised[,!outcells]
end_cells$embolised = end_cells$embolised[,end_cells$embolised@meta.data[,"endo_res.0.8"]!=12] # remove these extra stellate cells

end_cells$embolised = suppressWarnings(SCTransform(end_cells$embolised, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
end_cells$embolised = RunPCA(end_cells$embolised, verbose = F)
end_cells$embolised = RunUMAP(end_cells$embolised, dims = 1:15, verbose = F)
DimPlot(end_cells$embolised, reduction = "umap", group.by = "unique_name")



outcells = (end_cells$regenerating@reductions$umap@cell.embeddings[,2]>2.7 & 
              end_cells$regenerating@reductions$umap@cell.embeddings[,1]<(-1)) # this removes what seem to be central hepatocytes
end_cells$regenerating = end_cells$regenerating[,!outcells]

end_cells$regenerating = suppressWarnings(SCTransform(end_cells$regenerating, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
end_cells$regenerating = RunPCA(end_cells$regenerating, verbose = F)
end_cells$regenerating = RunUMAP(end_cells$regenerating, dims = 1:15, verbose = F)
DimPlot(end_cells$regenerating, reduction = "umap", group.by = "unique_name")

Calculate correlations

sig_genes = gene_fit_p[gene_fit_p<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:1000])
top_sig_genes = top_sig_genes[top_sig_genes %in% rownames(end_cells$embolised@assays$SCT@data) &
                                top_sig_genes %in%  rownames(end_cells$regenerating@assays$SCT@data)]

data_gam_h = cbind(data.frame("DPT" = df_dc$DPT),
                   Matrix::t(end_cells$healthy@assays$SCT@data[top_sig_genes,]))
data_gam_h$DPT = scales::rescale(rank(-data_gam_h$DPT), c(0.00001, 0.99999))

gam_healthy = gam::gam(DPT ~ ., data = data_gam_h, family = mgcv::betar(link = "logit", eps = 0.00001))

hea_pred = predict(gam_healthy,
                   Matrix::t(end_cells$health@assays$SCT@data[top_sig_genes,]), type= "response")
reg_pred = predict(gam_healthy,
                   Matrix::t(end_cells$regenerating@assays$SCT@data[top_sig_genes,]), type= "response")
emb_pred = predict(gam_healthy,
                   Matrix::t(end_cells$embolised@assays$SCT@data[top_sig_genes,]), type= "response")

plot(density(data_gam_h$DPT), col = "green", ylim = c(0,2.75))
lines(density(reg_pred), col = "blue")
lines(density(emb_pred), col = "red")
lines(density(hea_pred), col = "grey50")
legend(-0.073,32, legend = c("healthy (DC1)", "healthy (pred)", 
                         "embolised (pred)", "regenerating (pred)"),
       fill = c("green", "grey50", "red", "blue"))


plot_df = data.frame(vals = c(data_gam_h$DPT, emb_pred, reg_pred), 
                     Condition = c(rep("healthy", length(data_gam_h$DPT)), 
                                   rep("embolised", length(emb_pred)), 
                                   rep("regenerating", length(reg_pred))),
                     Donor = c(end_cells$healthy$Donor, 
                               end_cells$embolised$Donor, end_cells$regenerating$Donor),
                     ct = c(end_cells$healthy$allcells_simp, 
                               end_cells$embolised$allcells_simp, end_cells$regenerating$allcells_simp))
plot_df$bins100 = cut(plot_df$vals, 10)
plot_df$Condition = factor(plot_df$Condition, levels = c("healthy", "embolised", "regenerating"))
ggplot(plot_df, aes(x = bins100, fill = Condition))+
     geom_bar(position = "fill")+
     labs(x = "zonation (binned)", y = "proportion")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.x = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")

ggplot(plot_df, aes(x = bins100, fill = Condition))+
     facet_wrap(~Condition)+
     geom_bar()+
     labs(x = "zonation (binned)", y = "Number of cells")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     coord_flip()+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.y = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")

ggplot(plot_df, aes(x = vals, colour = Donor))+
     facet_wrap(~Condition)+
     geom_density()+
     labs(x = "zonation (binned)", y = "proportion")+
     theme_bw()+
     theme(legend.title.align = 0,
           legend.position = "bottom")

ggplot(plot_df, aes(x = vals, colour = ct))+
     facet_wrap(~Condition)+
     geom_density()+
     labs(x = "zonation (binned)", y = "proportion")+
     theme_bw()+
     theme(legend.title.align = 0,
           legend.position = "bottom")


reg_df = data.frame("pt" = reg_pred, "don" = end_cells$regenerating$Donor)
reg_df = cbind(reg_df, Matrix::t(end_cells$regenerating@assays$SCT@data[gfresh,]))
ggplot(reg_df, aes(x = pt, y = CLEC1B, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)+
  geom_hline(yintercept = 3)+
  geom_vline(xintercept = c(0,1))


ggplot(reg_df, aes(x = CLEC1B, y = LYVE1, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)


traj_list = list("healthy" = data_gam_h$DPT, "embolised" = emb_pred, "regenerating" = reg_pred)

Find varying genes - using all cells along the ranked pseudotime

cor_h = t(cor(data_gam_h$DPT, as.matrix(Matrix::t(end_cells$healthy@assays$SCT@data)),
              method = "sp"))
cor_emb = t(cor(emb_pred, as.matrix(Matrix::t(end_cells$embolised@assays$SCT@data)),
                method = "sp"))
cor_reg = t(cor(reg_pred, as.matrix(Matrix::t(end_cells$regenerating@assays$SCT@data)),
                method = "sp"))

l = list(cor_h, cor_emb, cor_reg)
all_corr = Reduce(function(x, y) merge(x,y, by = "rn", all = T), 
                  lapply(l, function(x) data.frame(x, rn = row.names(x))))
colnames(all_corr) = c("genes", "healthy", "embolised", "regenerating")

write.csv(all_corr, file = paste0("results/zonation_cond/end_correlations_zonation_rank.csv"),
          col.names = T, row.names = F, quote = F)
attempt to set 'col.names' ignored

Save trajectories and genes

save(fits_list, qvals_list, traj_list, 
     file = "results/zonation_cond/end_traj_qval_fits_rank.RData")

Find varying genes - normalised by bins along the ranked pseudotime

qvals_b_list = list()
fits_b_list = list()

# get HVG from all conditions
hvg = unique(c(end_cells[["healthy"]]@assays$SCT@var.features,
               end_cells[["embolised"]]@assays$SCT@var.features,
               end_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(end_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["regenerating"]]@assays$SCT@data)]

for(cond in names(end_cells)){
  print(cond)
  set.seed(1)
  # filtering the cells by bins
  bins = cut(traj_list[[cond]], 100)
  dons = end_cells[[cond]]$Donor
  ndons = rowSums(table(bins, dons)>1)
  ncells = rowSums(table(bins, dons))
  sampleif = function(cells, bins, dons){
    res = c()
    for(d in unique(dons)){
      for(b in unique(bins)){
        x = cells[bins==b & dons==d]
        if(length(x)>=30){
          res = c(res, sample(x, 30, replace = F))
        } else if(length(x)>=1 & ndons[b]>=2 & ncells[b]>=10){
          res = c(res, x)
        } else{
          res = c(res, NULL)
        }
      }
    }
    return(res)
  }
  whichcells = sampleif(1:length(bins), bins, dons)
  
  # Fit GAM for each gene using pseudotime as independent variable.
  t = traj_list[[cond]][whichcells]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = end_cells[[cond]]@assays$SCT@data[g,whichcells]
    
    d = data.frame(z=z, t=t)
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_b_list[[cond]] = gene_fit_p
  fits_b_list[[cond]] = gene_fit_vals
}

Save trajectories and genes

save(fits_b_list, qvals_b_list, traj_list, 
     file = "results/zonation_cond/end_binned_traj_qval_fits_rank.RData")

Find varying genes - using each donor as a replicate and calculating the average expression for each of 100 bins

save(fits_b_list, qvals_b_list, traj_list, 
     file = "results/zonation_cond/end_binned_traj_qval_fits_rank.RData")

Save trajectories and genes

qvals_db_list = list()
fits_db_list = list()

# get HVG from all conditions
hvg = unique(c(end_cells[["healthy"]]@assays$SCT@var.features,
               end_cells[["embolised"]]@assays$SCT@var.features,
               end_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(end_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["regenerating"]]@assays$SCT@data)]

meta_df = data.frame("pt" = c(traj_list$healthy, traj_list$embolised, traj_list$regenerating),
                     "donors" = c(end_cells$healthy$Donor, end_cells$embolised$Donor,
                                  end_cells$regenerating$Donor),
                     "cond" = c(end_cells$healthy$Condition, end_cells$embolised$Condition,
                                  end_cells$regenerating$Condition))
meta_df$bins = cut(meta_df$pt, 100)

for(cond in unique(meta_df$cond)){
  print(cond)
  set.seed(1)
  
  # Fit GAM for each gene
  t = 1:100
  sub_meta_df = meta_df[meta_df$cond==cond,]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = end_cells[[cond]]@assays$SCT@data[g,]
    
    d = aggregate(z~bins+donors, data = cbind(sub_meta_df, z), FUN="mean")
    cc = data.frame(table(sub_meta_df$bins, as.factor(as.character(sub_meta_df$donors))))
    cc = cc[cc$Freq==0,]
    colnames(cc) = colnames(d)
    d = rbind(d, cc)
    d = d[order(d$bins, decreasing = F),]
    d$t = rep(t, each = length(unique(d$donors)))
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_db_list[[cond]] = gene_fit_p
  fits_db_list[[cond]] = gene_fit_vals
}
[1] "healthy"
[1] "embolised"
[1] "regenerating"

Plot cell distributions in pseudotime

save(fits_db_list, qvals_db_list, traj_list, 
     file = "results/zonation_cond/end_binnedDonor_traj_qval_fits_rank.RData")

Do cross validation on the healthy trajectory

plot_df = data.frame("pseudotime" = c(traj_list$healthy, traj_list$embolised,
                                      traj_list$regenerating),
           "donor" = c(end_cells$healthy@meta.data[,"Donor"],
                       end_cells$embolised@meta.data[names(traj_list$embolised),"Donor"],
                       end_cells$regenerating@meta.data[names(traj_list$regenerating),"Donor"]),
           "cond" = c(rep("healthy", length(traj_list$healthy)), 
                      rep("embolised", length(traj_list$embolised)),
                      rep("regenerating", length(traj_list$regenerating))))

pdf("results/zonation_cond/end_cell_distributions_zonation.pdf", useDingbats = F, 
    width = 6, height = 5)
ggplot(plot_df, aes(x = donor, y = pseudotime, fill = cond))+
  geom_boxplot(notch = T)+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, group = donor, colour = donor))+
  facet_wrap(~cond)+
  geom_density()+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, colour = cond))+
  geom_density()+
  theme_bw()
dev.off()
null device 
          1 

Save zonation results

sig_genes = qvals_b_list$healthy[qvals_b_list$healthy<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:1000]) 
# 500 in this case actually seems to be one of the models with the lowest MSE

data_gam_h = cbind(data.frame("DPT" = traj_list$healthy),
                   Matrix::t(end_cells$healthy@assays$SCT@data[top_sig_genes,]))
colnames(data_gam_h) = gsub("-", "_", colnames(data_gam_h), fixed = T)
colnames(data_gam_h) = gsub(".", "_", colnames(data_gam_h), fixed = T)

cv_gam = CVgam(data = data_gam_h, formula = DPT ~ ., nfold = 10, seed = 1, method = "glm.fit")
CV-mse-GAM  
     0.0111 
pdf("results/zonation_healthy/end_original_fitted_pseudotime.pdf", useDingbats = F, 
    width = 6, height = 5)
plot(data_gam_h$DPT, cv_gam$fitted, xlab = "Healthy pseudotime (DPT363)",ylab = "Predicted Healthy",
     main = paste0("Healthy original vs fitted pseudotime\nMSE: ", round(cv_gam$cvscale, 6)),
     pch = 19, cex = 0.5)
abline(0,1, col = "blue", lwd = 3)
dev.off()
null device 
          1 

Comparing genes between conditions

Load data

end_cells = readRDS(file = "results/zonation_cond/end_cells_zonation_rank.RDS")
hep_cells = readRDS(file = "results/zonation_cond/hep_cells_zonation_rank.RDS")

Load fits

end_cells = readRDS(file = "results/zonation_cond/end_cells_zonation_rank.RDS")
hep_cells = readRDS(file = "results/zonation_cond/hep_cells_zonation_rank.RDS")

Calculate expression per bins

load("results/zonation_cond/hep_binnedDonor_traj_qval_fits_rank.RData")
hep_fits_list = fits_db_list
hep_qvals_list = qvals_db_list
hep_traj = traj_list
load("results/zonation_cond/end_binnedDonor_traj_qval_fits_rank.RData")
end_fits_list = fits_db_list
end_qvals_list = qvals_db_list
end_traj = traj_list

Correlate binned expression

nbins = 100

dfhep_h = data.table("zonation_pt" = hep_traj$healthy)
dfhep_h$zonation_int = cut(dfhep_h$zonation_pt, nbins)
dfhep_e = data.table("zonation_pt" = hep_traj$embolised)
dfhep_e = dfhep_h[dfhep_e, on="zonation_pt", roll=Inf, rollends = T]
dfhep_r = data.table("zonation_pt" = hep_traj$regenerating)
dfhep_r = dfhep_h[dfhep_r, on="zonation_pt", roll=Inf, rollends = T]

hep_mean_bins = list(
healthy = apply(hep_cells$healthy@assays$SCT@data, 1, 
                        function(x) tapply(x, dfhep_h$zonation_int, mean)),
embolised = apply(hep_cells$embolised@assays$SCT@data, 1, 
                        function(x) tapply(x, dfhep_e$zonation_int, mean)),
regenerating = apply(hep_cells$regenerating@assays$SCT@data, 1, 
                        function(x) tapply(x, dfhep_r$zonation_int, mean))
)

Correlate fitted expression

comb_cond = combn(names(end_cells), 2)
hep_cor_fit_list = list()
for(i in 1:ncol(comb_cond)){
  c1 = comb_cond[1,i]
  c2 = comb_cond[2,i]
  
  genes = intersect(colnames(hep_fits_list[[c1]]),colnames(hep_fits_list[[c2]]))
  
  for(g in genes){
    hep_cor_fit_list[[paste0(c1,"_",c2)]][[g]] = cor(cbind(hep_fits_list[[c1]][,g],
                                                       hep_fits_list[[c2]][,g]),
                                                 method = "sp")[1,2]
  }
}
hep_cor_fit_list = lapply(hep_cor_fit_list, data.frame)
hep_cor_fit_list = lapply(hep_cor_fit_list, function(x) cbind(rownames(x), x))

end_cor_fit_list = list()
for(i in 1:ncol(comb_cond)){
  c1 = comb_cond[1,i]
  c2 = comb_cond[2,i]
  
  genes = intersect(colnames(end_fits_list[[c1]]),colnames(end_fits_list[[c2]]))
  
  for(g in genes){
    end_cor_fit_list[[paste0(c1,"_",c2)]][[g]] = cor(cbind(end_fits_list[[c1]][,g],
                                                       end_fits_list[[c2]][,g]),
                                                 method = "sp")[1,2]
  }
}
end_cor_fit_list = lapply(end_cor_fit_list, data.frame)
end_cor_fit_list = lapply(end_cor_fit_list, function(x) cbind(rownames(x), x))

Save correlations

save(hep_cor_list, end_cor_list, 
     file = "results/zonation_cond/zonation_cor_cond_lists_rank.RData")

Filter pathways with ribosomal genes

save(hep_cor_list, end_cor_list, 
     file = "results/zonation_cond/zonation_cor_cond_lists_rank.RData")

Hepatocytes

Get the top genes for each condition

rpfilter = function(x){
  return(!grepl("^RPL", x$geneID) & !grepl("^RPS", x$geneID) & 
           !grepl("/RPL", x$geneID, fixed = T) & !grepl("/RPS", x$geneID, fixed = T))
}

Find enriched GO Terms and Reactome pathways

# get signifficant genes showing variation (at least 0.1 range in the fitted values)
genes_list = list()
for(n in names(hep_fits_list)){
  h_fc_sp = apply(hep_fits_list[[n]], 2, function(x) diff(range(x)))
  genes_c = names(hep_qvals_list[[n]])[hep_qvals_list[[n]]<=0.05 &
                    names(hep_qvals_list[[n]]) %in% names(h_fc_sp)[h_fc_sp>=0.1]]
  genes_c = genes_c[genes_c %in% names(hep_qvals_list[[n]])[order(hep_qvals_list[[n]], decreasing = F)]]
  genes_list[[n]] = genes_c
}

Identify pathways more affected by one condition

anot_list = list()
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  df_cor = df_cor_list[[n]]
  
  # get the pathways for all genes, minus those with ribosomal genes
  all_path = terms_groups$all[rpfilter(terms_groups$all),]
  all_path = all_path[order(all_path$qvalue, decreasing = F),]
  
  # get gene groups as a data frame
  g_g = unlist(apply(df_cor[,c(1,3,4,5)], 1, 
                     function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))]))
  gene_group = data.frame("gene" = names(g_g),
                          "group" = g_g)
  
  # get fraction of genes from a condition that are present in a give pathway
  n_g_group = list()
  n_g_group[[levels(gene_group$group)[1]]] = c()
  n_g_group[[levels(gene_group$group)[2]]] = c()
  n_g_group[[levels(gene_group$group)[3]]] = c()
  for(i in 1:nrow(all_path)){
    genes_p = unlist(strsplit(all_path$geneID[i], "/"))
    for(gg in unique(gene_group$group)){
      ngg = sum(genes_p %in% gene_group$gene[gene_group$group==gg])/sum(gene_group$group==gg)
      n_g_group[[gg]] = c(n_g_group[[gg]], ngg)
    }
  }
  all_path = cbind(all_path, data.frame(n_g_group))
  all_path$diff = all_path[,13] - all_path[,12] # add difference between cond and healthy
  all_path$highest = colnames(all_path)[11:13][apply(all_path[,11:13], 1, which.max)]
  
  # group the detected pathways by similarity
  go_path = all_path[all_path$DB=="GO Term",]
  mat_path = matrix(0, nrow(go_path), nrow(go_path))
  for(i in 1:nrow(go_path)){
    g1 = go_path[i,"geneID"]
    g1 = unlist(strsplit(g1, "/"))
    for(j in 1:nrow(go_path)){
        g2 = go_path[j,"geneID"]
        g2 = unlist(strsplit(g2, "/"))
        mat_path[i,j] = length(intersect(g1, g2))/mean(c(length(g1), length(g2)))
    }
  }
  cl_go = hclust(dist(mat_path), method = "ward.D2")
  cl_go = cutree(cl_go, 7)
  rownames(mat_path) = go_path$Description
  anot = data.frame(row.names = go_path$Description, 
                    "cl" = paste0("cl",cl_go), 
                    "n" = go_path$Count,
                    "diff" = go_path$diff,
                    "qval" = -log10(go_path$qvalue),
                    "term" = go_path$Description)
  pheatmap::pheatmap(mat_path, clustering_method = "ward.D2", 
                     annotation_row = anot[,-5], show_rownames = F)
  genes_per_gocl = tapply(anot$term, anot$cl, 
         function(x) unique(unlist(strsplit(go_path[go_path$Description %in% x,"geneID"], "/"))))
  genes_cond_gocl = lapply(genes_per_gocl, function(x) lapply(genes_conds, function(y) sum(y %in% x)))
  genes_cond_gocl = Reduce(rbind, genes_cond_gocl)
  rownames(genes_cond_gocl) = paste0("cl", 1:7)
  
  anot_list[[n]] = anot
}

Plot top terms

anot_list = list()
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  df_cor = df_cor_list[[n]]
  
  # get the pathways for all genes, minus those with ribosomal genes
  all_path = terms_groups$all[rpfilter(terms_groups$all),]
  all_path = all_path[order(all_path$qvalue, decreasing = F),]
  
  # get gene groups as a data frame
  g_g = unlist(apply(df_cor[,c(1,3,4,5)], 1, 
                     function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))]))
  gene_group = data.frame("gene" = names(g_g),
                          "group" = g_g)
  
  # get fraction of genes from a condition that are present in a give pathway
  n_g_group = list()
  n_g_group[[levels(gene_group$group)[1]]] = c()
  n_g_group[[levels(gene_group$group)[2]]] = c()
  n_g_group[[levels(gene_group$group)[3]]] = c()
  for(i in 1:nrow(all_path)){
    genes_p = unlist(strsplit(all_path$geneID[i], "/"))
    for(gg in unique(gene_group$group)){
      ngg = sum(genes_p %in% gene_group$gene[gene_group$group==gg])/sum(gene_group$group==gg)
      n_g_group[[gg]] = c(n_g_group[[gg]], ngg)
    }
  }
  all_path = cbind(all_path, data.frame(n_g_group))
  all_path$diff = all_path[,13] - all_path[,12] # add difference between cond and healthy
  all_path$highest = colnames(all_path)[11:13][apply(all_path[,11:13], 1, which.max)]
  
  # group the detected pathways by similarity
  go_path = all_path[all_path$DB=="GO Term",]
  mat_path = matrix(0, nrow(go_path), nrow(go_path))
  for(i in 1:nrow(go_path)){
    g1 = go_path[i,"geneID"]
    g1 = unlist(strsplit(g1, "/"))
    for(j in 1:nrow(go_path)){
        g2 = go_path[j,"geneID"]
        g2 = unlist(strsplit(g2, "/"))
        mat_path[i,j] = length(intersect(g1, g2))/mean(c(length(g1), length(g2)))
    }
  }
  cl_go = hclust(dist(mat_path), method = "ward.D2")
  cl_go = cutree(cl_go, 7)
  rownames(mat_path) = go_path$Description
  anot = data.frame(row.names = go_path$Description, 
                    "cl" = paste0("cl",cl_go), 
                    "n" = go_path$Count,
                    "diff" = go_path$diff,
                    "qval" = -log10(go_path$qvalue),
                    "term" = go_path$Description)
  pheatmap::pheatmap(mat_path, clustering_method = "ward.D2", 
                     annotation_row = anot[,-5], show_rownames = F)
  genes_per_gocl = tapply(anot$term, anot$cl, 
         function(x) unique(unlist(strsplit(go_path[go_path$Description %in% x,"geneID"], "/"))))
  genes_cond_gocl = lapply(genes_per_gocl, function(x) lapply(genes_conds, function(y) sum(y %in% x)))
  genes_cond_gocl = Reduce(rbind, genes_cond_gocl)
  rownames(genes_cond_gocl) = paste0("cl", 1:7)
  
  anot_list[[n]] = anot
}

Plot some genes

pdf("results/zonation_cond/hep_top_go_terms_genes.pdf", 
    height = 6, width = 7, useDingbats = F)
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  
  for(cond in names(terms_groups)){
    plot_df = terms_groups[[cond]][rpfilter(terms_groups[[cond]]),]
    plot_df = plot_df[order(plot_df$qvalue, decreasing = F),]
    plot_df$Description = factor(plot_df$Description, levels = rev(plot_df$Description))
    
    plt = ggplot(plot_df[1:10,], aes(x = Description, y = -log10(qvalue)))+
      geom_bar(stat = "identity")+
      coord_flip()+
      labs(title = paste0(n, ": ", cond))+
      theme_bw()
    print(plt)
  }
}
dev.off()
null device 
          1 
pathway = "negative regulation of apoptotic signaling pathway"
unique(exp_plt_list$healthy_regenerating$regenerating[[pathway]]$variable)
gn = "ICAM1"
plot_df = exp_plt_list$healthy_regenerating$regenerating[[pathway]]
plot_df = plot_df[plot_df$variable==gn,]
c1 = "healthy"
c2 = "regenerating"
pw = "regenerating"
ggplot(plot_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
    geom_line(mapping = aes(colour = cond))+
    labs(title = pathway, subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"), 
         y = paste0("log(norm exp) ", gn))+
    theme_classic()

Save important lists

save(terms_groups_list, anot_list, df_cor_list, genes_list, plot_list, exp_plt_list,
     file = "results/zonation_cond/hep_go_results_rank.RData")

Endothelial cells

Get the top genes for each condition

save(terms_groups_list, anot_list, df_cor_list, genes_list, plot_list, exp_plt_list,
     file = "results/zonation_cond/hep_go_results_rank.RData")

Find enriched GO Terms and Reactome pathways

# get signifficant genes showing variation (at least 0.1 range in the fitted values)
genes_list = list()
for(n in names(end_fits_list)){
  h_fc_sp = apply(end_fits_list[[n]], 2, function(x) diff(range(x)))
  genes_c = names(end_qvals_list[[n]])[end_qvals_list[[n]]<=0.05 &
                    names(end_qvals_list[[n]]) %in% names(h_fc_sp)[h_fc_sp>=0.1]]
  genes_c = genes_c[genes_c %in% names(end_qvals_list[[n]])[order(end_qvals_list[[n]], decreasing = F)]]
  genes_list[[n]] = genes_c
}

Identify pathways more affected by one condition

anot_list = list()
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  df_cor = df_cor_list[[n]]
  
  # get the pathways for all genes, minus those with ribosomal genes
  all_path = terms_groups$all[rpfilter(terms_groups$all),]
  all_path = all_path[order(all_path$qvalue, decreasing = F),]
  
  # get gene groups as a data frame
  g_g = unlist(apply(df_cor[,c(1,3,4,5)], 1, 
                     function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))]))
  gene_group = data.frame("gene" = names(g_g),
                          "group" = g_g)
  
  # get fraction of genes from a condition that are present in a give pathway
  n_g_group = list()
  n_g_group[[levels(gene_group$group)[1]]] = c()
  n_g_group[[levels(gene_group$group)[2]]] = c()
  n_g_group[[levels(gene_group$group)[3]]] = c()
  for(i in 1:nrow(all_path)){
    genes_p = unlist(strsplit(all_path$geneID[i], "/"))
    for(gg in unique(gene_group$group)){
      ngg = sum(genes_p %in% gene_group$gene[gene_group$group==gg])/sum(gene_group$group==gg)
      n_g_group[[gg]] = c(n_g_group[[gg]], ngg)
    }
  }
  all_path = cbind(all_path, data.frame(n_g_group))
  all_path$diff = all_path[,13] - all_path[,12] # add difference between cond and healthy
  all_path$highest = colnames(all_path)[11:13][apply(all_path[,11:13], 1, which.max)]
  
  # group the detected pathways by similarity
  go_path = all_path[all_path$DB=="GO Term",]
  mat_path = matrix(0, nrow(go_path), nrow(go_path))
  for(i in 1:nrow(go_path)){
    g1 = go_path[i,"geneID"]
    g1 = unlist(strsplit(g1, "/"))
    for(j in 1:nrow(go_path)){
        g2 = go_path[j,"geneID"]
        g2 = unlist(strsplit(g2, "/"))
        mat_path[i,j] = length(intersect(g1, g2))/mean(c(length(g1), length(g2)))
    }
  }
  cl_go = hclust(dist(mat_path), method = "ward.D2")
  cl_go = cutree(cl_go, 7)
  rownames(mat_path) = go_path$Description
  anot = data.frame(row.names = go_path$Description, 
                    "cl" = paste0("cl",cl_go), 
                    "n" = go_path$Count,
                    "diff" = go_path$diff,
                    "qval" = -log10(go_path$qvalue),
                    "term" = go_path$Description)
  pheatmap::pheatmap(mat_path, clustering_method = "ward.D2", 
                     annotation_row = anot[,-5], show_rownames = F)
  genes_per_gocl = tapply(anot$term, anot$cl, 
         function(x) unique(unlist(strsplit(go_path[go_path$Description %in% x,"geneID"], "/"))))
  genes_cond_gocl = lapply(genes_per_gocl, function(x) lapply(genes_conds, function(y) sum(y %in% x)))
  genes_cond_gocl = Reduce(rbind, genes_cond_gocl)
  rownames(genes_cond_gocl) = paste0("cl", 1:7)
  
  anot_list[[n]] = anot
}

Plot top terms

anot_list = list()
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  df_cor = df_cor_list[[n]]
  
  # get the pathways for all genes, minus those with ribosomal genes
  all_path = terms_groups$all[rpfilter(terms_groups$all),]
  all_path = all_path[order(all_path$qvalue, decreasing = F),]
  
  # get gene groups as a data frame
  g_g = unlist(apply(df_cor[,c(1,3,4,5)], 1, 
                     function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))]))
  gene_group = data.frame("gene" = names(g_g),
                          "group" = g_g)
  
  # get fraction of genes from a condition that are present in a give pathway
  n_g_group = list()
  n_g_group[[levels(gene_group$group)[1]]] = c()
  n_g_group[[levels(gene_group$group)[2]]] = c()
  n_g_group[[levels(gene_group$group)[3]]] = c()
  for(i in 1:nrow(all_path)){
    genes_p = unlist(strsplit(all_path$geneID[i], "/"))
    for(gg in unique(gene_group$group)){
      ngg = sum(genes_p %in% gene_group$gene[gene_group$group==gg])/sum(gene_group$group==gg)
      n_g_group[[gg]] = c(n_g_group[[gg]], ngg)
    }
  }
  all_path = cbind(all_path, data.frame(n_g_group))
  all_path$diff = all_path[,13] - all_path[,12] # add difference between cond and healthy
  all_path$highest = colnames(all_path)[11:13][apply(all_path[,11:13], 1, which.max)]
  
  # group the detected pathways by similarity
  go_path = all_path[all_path$DB=="GO Term",]
  mat_path = matrix(0, nrow(go_path), nrow(go_path))
  for(i in 1:nrow(go_path)){
    g1 = go_path[i,"geneID"]
    g1 = unlist(strsplit(g1, "/"))
    for(j in 1:nrow(go_path)){
        g2 = go_path[j,"geneID"]
        g2 = unlist(strsplit(g2, "/"))
        mat_path[i,j] = length(intersect(g1, g2))/mean(c(length(g1), length(g2)))
    }
  }
  cl_go = hclust(dist(mat_path), method = "ward.D2")
  cl_go = cutree(cl_go, 7)
  rownames(mat_path) = go_path$Description
  anot = data.frame(row.names = go_path$Description, 
                    "cl" = paste0("cl",cl_go), 
                    "n" = go_path$Count,
                    "diff" = go_path$diff,
                    "qval" = -log10(go_path$qvalue),
                    "term" = go_path$Description)
  pheatmap::pheatmap(mat_path, clustering_method = "ward.D2", 
                     annotation_row = anot[,-5], show_rownames = F)
  genes_per_gocl = tapply(anot$term, anot$cl, 
         function(x) unique(unlist(strsplit(go_path[go_path$Description %in% x,"geneID"], "/"))))
  genes_cond_gocl = lapply(genes_per_gocl, function(x) lapply(genes_conds, function(y) sum(y %in% x)))
  genes_cond_gocl = Reduce(rbind, genes_cond_gocl)
  rownames(genes_cond_gocl) = paste0("cl", 1:7)
  
  anot_list[[n]] = anot
}

Plot some genes

# plot all genes from pathway in grey, average in colour for cond and healthy
plot_list = list()
exp_plt_list = list()
for(comp in names(terms_groups_list)){
  print(comp)
  c1 = strsplit(comp, "_")[[1]][1]
  c2 = strsplit(comp, "_")[[1]][2]
  
  plot_list[[comp]] = list()
  exp_plt_list[[comp]] = list()
  for(pw in c(c1, c2)){
    goterms = terms_groups_list[[comp]][[pw]]
    goterms = goterms[goterms$DB=="GO Term" & rpfilter(goterms),]
    
    exp_plt_list[[comp]][[pw]] = list()
    plot_list[[comp]][[pw]] = list()
    if(nrow(goterms)>0){
      for(go in 1:nrow(goterms)){
        genes_path = unlist(strsplit(goterms[go,"geneID"],"/"))
        
        exp_healthy = reshape2::melt(end_fits_list[[c1]][,genes_path])
        exp_healthy$xx = rep(1:100, length(genes_path))
        exp_healthy$cond = c1
        
        exp_cond = reshape2::melt(end_fits_list[[c2]][,genes_path])
        exp_cond$xx = rep(1:100, length(genes_path))
        exp_cond$cond = c2
  
        exp_df = rbind(exp_healthy, exp_cond)
  
        plt = ggplot(exp_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
          #geom_line(mapping = aes(colour = cond), alpha = 0.2)+
          stat_smooth(mapping = aes(group = cond, colour = cond), show.legend = T)+
          labs(title = goterms[go,"Description"], 
               subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"))+
          theme_classic()
        
        plot_list[[comp]][[pw]][[goterms[go,"Description"]]] = plt
        exp_plt_list[[comp]][[pw]][[goterms[go,"Description"]]] = exp_df
      }
    }
  }
}
pathway = "cellular response to type I interferon"
unique(exp_plt_list$healthy_regenerating$regenerating[[pathway]]$variable)
gn = "EGR1"
plot_df = exp_plt_list$healthy_regenerating$regenerating[[pathway]]
plot_df = plot_df[plot_df$variable==gn,]
c1 = "healthy"
c2 = "regenerating"
pw = "regenerating"
ggplot(plot_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
    geom_line(mapping = aes(colour = cond))+
    labs(title = pathway, subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"), 
         y = paste0("log(norm exp) ", gn))+
    theme_classic()

Save important lists

save(terms_groups_list, anot_list, df_cor_list, plot_list, exp_plt_list,
     file = "results/zonation_cond/end_go_results_rank.RData")

OLD CODE

Find genes with differencial variation - not very interesting since most genes seem to be showing a variation

save(terms_groups_list, anot_list, df_cor_list, plot_list, exp_plt_list,
     file = "results/zonation_cond/end_go_results_rank.RData")

DTW condition vs condition

cond_comb = combn(names(qvals_list), 2)
for(i in 1:ncol(cond_comb)){
  genes = intersect(names(qvals_list[[cond_comb[1,i]]])[qvals_list[[cond_comb[1,i]]]<=0.05],
                    names(qvals_list[[cond_comb[2,i]]])[qvals_list[[cond_comb[2,i]]]<=0.05])
  
  sc_list = list()
  for(cond in cond_comb[,i]){
    gene_fit_vals = fits_list[[cond]]
    gene_fit_vals_filt = t(gene_fit_vals[,genes])
    sc_list[[cond]] = t(apply(gene_fit_vals_filt, 1, scale))
  }
  
  xxx = dtw(sc_list[[1]], sc_list[[2]], window.type = "itakura", dist.method = "DTW")
  yyy = dtw(dist(t(sc_list[[1]]), t(sc_list[[2]]), method = "DTW"), window.type = "itakura")
}
source("aux_scripts/primate_cerebral_organoids_pt_alignment.r")

xcv = align_pt_traj(expr_ref = hep_cells$healthy@assays$SCT@data[genes,], 
                    pt_ref = traj_list$healthy,
                    expr_query = hep_cells$embolised@assays$SCT@data[genes,], 
                    pt_query = traj_list$embolised, num_breaks_ref = 100, 
                    ref_name = "healthy", query_name = "embolised")

plot(traj_list$embolised, xcv$pt_query_aligned)
abline(0,1)

xcb = align_pt_traj(expr_ref = hep_cells$healthy@assays$SCT@data[genes,], 
                    pt_ref = traj_list$healthy,
                    expr_query = hep_cells$regenerating@assays$SCT@data[genes,], 
                    pt_query = traj_list$regenerating, num_breaks_ref = 100, 
                    ref_name = "healthy", query_name = "regenerating")

plot(traj_list$regenerating, xcb$pt_query_aligned)
abline(0,1)
g = "CLEC1B"
plt_df = data.frame("traj" = c(traj_list$regenerating, traj_list$healthy),
           "exp" = c(end_cells$regenerating@assays$SCT@data[g,names(traj_list$regenerating)],
           end_cells$healthy@assays$SCT@data[g,]),
           "cond" = c(rep("regenerating", length(traj_list$regenerating)),
                      rep("healthy", length(traj_list$healthy))),
           "cl" = c(end_cells$regenerating@meta.data$allcells_simp,
                    end_cells$healthy@meta.data$allcells_simp))
plt_df = plt_df[order(plt_df$traj, decreasing = T),]

ggplot(plt_df, aes(x = traj, y = exp, group = cond, colour = cond))+
  geom_point()+
  stat_smooth()+
  labs(title = g)+
  theme_bw()

ggplot(plt_df, aes(x = traj, y = exp, group = cond, colour = cl, shape = cond))+
  geom_point()+
  stat_smooth()+
  labs(title = g)+
  theme_bw()

xxx = dtw::dtw(x = plt_df[plt_df$cond=="healthy","exp"], y = plt_df[plt_df$cond=="regenerating","exp"])
plot(xxx)
abline(0,1)

hti = cut_number(plt_df$traj[plt_df$cond=="healthy"], 100)
rti = cut_number(plt_df$traj[plt_df$cond=="regenerating"], 100)
df2 = data.frame("r" = tapply(plt_df$exp[plt_df$cond=="regenerating"], rti, mean),
                 "h" = tapply(plt_df$exp[plt_df$cond=="healthy"], hti, mean))
yyy = dtw::dtw(x = df2$h, y = df2$r)
plot(yyy)
abline(0,1)

Get GO Term enrichment

library(topGO)
cor_lists = list("hep" = hep_cor_list, "end" = end_cor_list)

cc = -0.25
go_enr_list = list()
for(gr in names(cor_lists)){
  cor_list = cor_lists[[gr]]

  go_enr_list[[gr]] = list()
  for(comp in names(cor_list)){
    testdf = cor_list[[comp]]
    
    
    
    ncond = strsplit(comp, "_")[[1]]
    go_enr_list[[gr]][[comp]] = list()
    for(cond in ncond){
      gsig = names(hep_qval[[cond]])[hep_qval[[cond]]<=0.05]
      
      subtestdf = merge(testdf, hep_qval[[cond]], by = 0)
      
      genes = subtestdf$X..i..
      names(genes) = subtestdf[,1]
      
      # GO enrichment
      fsel = function(x) return(x<cc & subtestdf$y<=0.05)
      
      sampleGOdata <- new("topGOdata",
                          description = "test", ontology = "BP",
                          allGenes = genes, geneSel = fsel,
                          nodeSize = 5, annot = annFUN.org, 
                          mapping = "org.Hs.eg.db", ID = "symbol")
      
      resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", 
                               statistic = "fisher")
      
      resGO = GenTable(object = sampleGOdata, elimKS = resultKS.elim,
                       topNodes = resultKS.elim@geneData[4], numChar = 1000)
      resGO$elimKS = score(resultKS.elim)[resGO$GO.ID]
      resGO$cond = cond
      
      # get genes
      relevant_genes = sampleGOdata@allGenes[sampleGOdata@geneSelectionFun(sampleGOdata@allScores)]
      goGenes = t(data.frame(lapply(genesInTerm(sampleGOdata), function(x) paste(x[x %in% relevant_genes], collapse = ", "))))
      rownames(goGenes) = gsub(".", ":", rownames(goGenes), fixed = T)
      colnames(goGenes) = "genes"
      resGO = merge(resGO, goGenes, by.x = 1, by.y = 0, all.x = T)
      
      if(cond==ncond[[1]]){
        go_enr_list[[gr]][[comp]] = resGO[,c("GO.ID","Term","Annotated","Significant",
                                             "Expected","elimKS","cond","genes")]
      } else{
        go_enr_list[[gr]][[comp]] = rbind(go_enr_list[[gr]][[comp]],
                                          resGO[,c("GO.ID","Term","Annotated","Significant",
                                                   "Expected","elimKS","cond","genes")])
      }
      
    }
  }
}
saveRDS(go_enr_list, file = "results/zonation_cond/go_enr_corsig_list.RDS")
gg = unique(c(names(end_qval$healthy)[order(end_qval$healthy, decreasing = F)][1:1000],
              names(end_qval$regenerating)[order(end_qval$regenerating, decreasing = F)][1:1000]))
diff_fits = apply(end_fits$healthy[,gg], 2, rank)-apply(end_fits$regenerating[,gg], 2, rank)
cl = hclust(dist(t(diff_fits)), method = "ward.D")
ct = cutree(cl, 6)
df = data.frame("cor" = end_cor_fit_list$healthy_regenerating[gg,2],
                 "score" = apply(apply(t(diff_fits), 1, range), 
                                 2, function(x) sum(abs(x))))
diff_genes = rownames(df)[df$cor<0.25]
cl = hclust(dist(t(diff_fits)[diff_genes,]), method = "ward.D")
ct = cutree(cl, 4)
df = data.frame("cl" = factor(ct),
                "cor" = 1-end_cor_fit_list$healthy_regenerating[diff_genes,2],
                 "score" = apply(apply(t(diff_fits)[diff_genes,], 1, range), 
                                 2, function(x) sum(abs(x))))
pheatmap::pheatmap(t(diff_fits)[diff_genes,], cluster_cols = F, 
                   clustering_method = "ward.D", fontsize_row = 8, annotation_row = df)

plot(hep_fits$healthy[,"SAA2"])
plot(hep_fits$regenerating[,"SAA2"])
fsel = function(x) return(x<0.25 & names(genes) %in% gg)
fsel = function(x) return(names(genes) %in% gg)
genes = end_cor_fit_list$healthy_regenerating$X..i..
names(genes) = end_cor_fit_list$healthy_regenerating[,1]
genes = genes[!is.na(genes)]

eg = clusterProfiler::bitr(names(genes[fsel(genes)]), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg_all = clusterProfiler::bitr(names(genes), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")

x <- ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                               readable=T, universe = eg_all$ENTREZID)
clusterProfiler::dotplot(x)
head(as.data.frame(x))

ego3 <- clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                               OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                               qvalueCutoff  = 0.05, 
                               pAdjustMethod = "BH", readable = T)
clusterProfiler::dotplot(ego3)
gg = unique(c(names(qvals_b_list$healthy)[order(qvals_b_list$healthy, decreasing = F)][1:1000],
              names(qvals_b_list$regenerating)[order(qvals_b_list$regenerating, decreasing = F)][1:1000]))
diff_fits = apply(fits_b_list$healthy[,gg], 2, rank)-apply(fits_b_list$regenerating[,gg], 2, rank)
cl = hclust(dist(t(diff_fits)), method = "ward.D")
ct = cutree(cl, 6)
df = data.frame("cor" = hep_cor_fit_list$healthy_regenerating[gg,2],
                 "score" = apply(apply(t(diff_fits), 1, range), 
                                 2, function(x) sum(abs(x))))
df = df[complete.cases(df),]
diff_genes = rownames(df)[df$cor<0.25]
cl = hclust(dist(t(diff_fits)[diff_genes,]), method = "ward.D")
ct = cutree(cl, 4)
df = data.frame("cl" = factor(ct),
                "cor" = 1-hep_cor_fit_list$healthy_regenerating[diff_genes,2],
                 "score" = apply(apply(t(diff_fits)[diff_genes,], 1, range), 
                                 2, function(x) sum(abs(x))))
pheatmap::pheatmap(t(diff_fits)[diff_genes,], cluster_cols = F, 
                   clustering_method = "ward.D", fontsize_row = 8, annotation_row = df)

plot(hep_fits$healthy[,"CPS1"])
plot(hep_fits$regenerating[,"CPS1"])
genes_healthy = names(qvals_b_list$healthy)[order(qvals_b_list$healthy, decreasing = F)][qvals_b_list$healthy[order(qvals_b_list$healthy, decreasing = F)]<=0.05]
genes_regen = names(qvals_b_list$regenerating)[order(qvals_b_list$regenerating, decreasing = F)][qvals_b_list$regenerating[order(qvals_b_list$regenerating, decreasing = F)]<=0.05]
gg = unique(c(genes_healthy, genes_regen))

eg = clusterProfiler::bitr(gg, fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg_all = clusterProfiler::bitr(names(qvals_b_list$healthy), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")

reac = ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                               readable=T, universe = eg_all$ENTREZID)

got = clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                qvalueCutoff  = 0.05, 
                                pAdjustMethod = "BH", readable = T)

all_terms = rbind(as.data.frame(reac), as.data.frame(got))
all_terms$DB = c(rep("Reactome", nrow(as.data.frame(reac))), rep("GO Term", nrow(as.data.frame(got))))
all_terms = all_terms[all_terms$qvalue<=0.05,]
all_terms_noribo = all_terms[!grepl("RPL", all_terms$geneID) & !grepl("RPS", all_terms$geneID),]

nhealthy = c()
nregen = c()
corhealthy = c()
corregen = c()
mean_traj_h = matrix(0, nrow(all_terms_noribo), 100)
mean_traj_r = matrix(0, nrow(all_terms_noribo), 100)
cor_mean = c()
for(i in 1:nrow(all_terms_noribo)){
  genes_path = unlist(strsplit(all_terms_noribo[i,"geneID"],"/"))
  in_healthy = genes_path[genes_path %in% genes_healthy]
  in_regen = genes_path[genes_path %in% genes_regen]
  
  nhealthy = c(nhealthy, length(in_healthy))
  nregen = c(nregen, length(in_regen))
  
  if(length(in_healthy)>1){
    ch = cor(fits_b_list$healthy[,in_healthy], method = "sp")
    corhealthy = c(corhealthy, median(ch[upper.tri(ch)])) 
  } else{ 
    corhealthy = c(corhealthy,1)
  }
  if(length(in_regen)>1){
    ch = cor(fits_b_list$regen[,in_regen], method = "sp")
    corregen = c(corregen, median(ch[upper.tri(ch)])) 
  } else{ 
    corregen = c(corregen,1)
  }
  
  mean_traj_h[i,] = if(length(in_healthy)>1){ rowMeans(fits_b_list$healthy[,in_healthy])} else if(length(in_healthy)>0){fits_b_list$healthy[,in_healthy]} else {rep(0,100)}
  mean_traj_r[i,] = if(length(in_regen)>1){ rowMeans(fits_b_list$regenerating[,in_regen])} else if(length(in_regen)>0){fits_b_list$regenerating[,in_regen]} else {rep(0,100)}
  
  cor_mean = c(cor_mean, cor(mean_traj_h[i,], mean_traj_r[i,]))
}
all_terms_noribo = cbind(all_terms_noribo, nhealthy, nregen, corhealthy, corregen, cor_mean)
rownames(mean_traj_h) = rownames(mean_traj_r) = all_terms_noribo$Description


View(all_terms_noribo[all_terms_noribo$corhealthy>=0.25 & all_terms_noribo$corregen>=0.25 &
                        all_terms_noribo$nhealthy>=2 & all_terms_noribo$nregen>=2,])
i = which(all_terms_noribo$Description=="mRNA Splicing - Minor Pathway")
genes_healthy = names(qvals_b_list$healthy)[qvals_b_list$healthy<=0.05]
genes_regen = names(qvals_b_list$regenerating)[qvals_b_list$regenerating<=0.05]
genes_conds = list("healthy" = setdiff(genes_healthy, genes_regen), 
                   "regen" = setdiff(genes_regen, genes_healthy),
                   "common" = intersect(genes_healthy, genes_regen))

terms_list = list()
for(n in names(genes_conds)){
  eg = clusterProfiler::bitr(genes_conds[[n]], fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
  eg_all = clusterProfiler::bitr(names(qvals_b_list$healthy), fromType="SYMBOL", 
                             toType="ENTREZID", OrgDb="org.Hs.eg.db")
  
  reac = ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                 readable=T, universe = eg_all$ENTREZID)
  
  got = clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                  OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                  qvalueCutoff  = 0.05, 
                                  pAdjustMethod = "BH", readable = T)
  
  all_terms = rbind(as.data.frame(reac), as.data.frame(got))
  all_terms$DB = c(rep("Reactome", nrow(as.data.frame(reac))), rep("GO Term", nrow(as.data.frame(got))))
  all_terms = all_terms[all_terms$qvalue<=0.05,]
  all_terms_noribo = all_terms[!grepl("RPL", all_terms$geneID) & !grepl("RPS", all_terms$geneID),]
  
  nhealthy = c()
  nregen = c()
  corhealthy = c()
  corregen = c()
  mean_traj_h = matrix(0, nrow(all_terms_noribo), 100)
  mean_traj_r = matrix(0, nrow(all_terms_noribo), 100)
  cor_mean = c()
  for(i in 1:nrow(all_terms_noribo)){
    genes_path = unlist(strsplit(all_terms_noribo[i,"geneID"],"/"))
    in_healthy = genes_path[genes_path %in% genes_healthy]
    in_regen = genes_path[genes_path %in% genes_regen]
    
    nhealthy = c(nhealthy, length(in_healthy))
    nregen = c(nregen, length(in_regen))
    
    if(length(in_healthy)>1){
      ch = cor(fits_b_list$healthy[,in_healthy], method = "sp")
      corhealthy = c(corhealthy, median(ch[upper.tri(ch)])) 
    } else{ 
      corhealthy = c(corhealthy,0)
    }
    if(length(in_regen)>1){
      ch = cor(fits_b_list$regen[,in_regen], method = "sp")
      corregen = c(corregen, median(ch[upper.tri(ch)])) 
    } else{ 
      corregen = c(corregen,0)
    }
    
    mean_traj_h[i,] = if(length(in_healthy)>1){ rowMeans(fits_b_list$healthy[,in_healthy])} else if(length(in_healthy)>0){fits_b_list$healthy[,in_healthy]} else {rep(0,100)}
    mean_traj_r[i,] = if(length(in_regen)>1){ rowMeans(fits_b_list$regenerating[,in_regen])} else if(length(in_regen)>0){fits_b_list$regenerating[,in_regen]} else {rep(0,100)}
    
    cor_mean = c(cor_mean, cor(mean_traj_h[i,], mean_traj_r[i,]))
  }
  all_terms_noribo = cbind(all_terms_noribo, nhealthy, nregen, corhealthy, corregen, cor_mean)
  rownames(mean_traj_h) = rownames(mean_traj_r) = all_terms_noribo$Description
  
  terms_list[[n]] = all_terms_noribo
}
cl = hclust(dist(t(diff_fits)), method = "ward.D")
ct = cutree(cl, 1)

term_list = list()
for(i in unique(ct)){
  fsel = function(x) return(x<0.25 & names(genes) %in% names(ct)[ct==i])
  genes = hep_cor_fit_list$healthy_regenerating$X..i..
  names(genes) = hep_cor_fit_list$healthy_regenerating[,1]
  genes = genes[!is.na(genes)]
  
  eg = clusterProfiler::bitr(names(genes[fsel(genes)]), fromType="SYMBOL", 
                             toType="ENTREZID", OrgDb="org.Hs.eg.db")
  eg_all = clusterProfiler::bitr(names(genes), fromType="SYMBOL", 
                             toType="ENTREZID", OrgDb="org.Hs.eg.db")
  
  x <- ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                 readable=T, universe = eg_all$ENTREZID)
  
  term_list[[paste0("reac_",i)]] = x
  
  ego3 <- clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                 OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                 qvalueCutoff  = 0.05, 
                                 pAdjustMethod = "BH", readable = T)
  
  term_list[[paste0("go_",i)]] = ego3
}
genes = hep_cor_fit_list$healthy_regenerating$X..i..
names(genes) = hep_cor_fit_list$healthy_regenerating[,1]
genes = genes[!is.na(genes)]

# GO enrichment
fsel = function(x) return(x<0.25 & names(genes) %in% gg)

sampleGOdata <- new("topGOdata",
                    description = "test", ontology = "BP",
                    allGenes = genes, geneSel = fsel,
                    nodeSize = 5, annot = annFUN.org, 
                    mapping = "org.Hs.eg.db", ID = "symbol")

resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", 
                         statistic = "fisher")

resGO = GenTable(object = sampleGOdata, elimKS = resultKS.elim,
                 topNodes = resultKS.elim@geneData[4], numChar = 1000)
resGO$elimKS = score(resultKS.elim)[resGO$GO.ID]


# get genes
relevant_genes = sampleGOdata@allGenes[sampleGOdata@geneSelectionFun(sampleGOdata@allScores)]
goGenes = t(data.frame(lapply(genesInTerm(sampleGOdata), function(x) paste(x[x %in% relevant_genes], collapse = ", "))))
rownames(goGenes) = gsub(".", ":", rownames(goGenes), fixed = T)
colnames(goGenes) = "genes"
resGO = merge(resGO, goGenes, by.x = 1, by.y = 0, all.x = T)
subresGO = resGO[!(grepl("RPL", resGO$genes) & grepl("RPS", resGO$genes)),]
sig_resGO = resGO[resGO$elimKS<=0.05,]
hprof_go = list()
rprof_go = list()
for(l in 1:nrow(sig_resGO)){
  genes_go = unlist(strsplit(as.character(sig_resGO[l,"genes"]), ", "))
  
  hprof_go[[sig_resGO[l,2]]] = t(hep_fits$healthy[,genes_go])
  rprof_go[[sig_resGO[l,2]]] = t(hep_fits$regenerating[,genes_go])
}
hmax = unlist(lapply(hprof_go, function(x) which.max(colSums(x))))
rmax = unlist(lapply(rprof_go, function(x) which.max(colSums(x))))
names(hmax) = names(rmax) = sig_resGO$Term
sig_resGO = sig_resGO[order(sig_resGO$elimKS, decreasing = F),]
go_short_list = list()
for(i in 1:nrow(sig_resGO)){
  g = unlist(strsplit(as.character(sig_resGO[i,"genes"]), ", "))
  if(!any(g %in% unlist(go_short_list)) & length(g)>1){
    go_short_list[[sig_resGO[i,2]]] = g
  }
}

res = matrix(0, 1, 100)
for(n in names(go_short_list)){
  if(length(go_short_list[[n]])>1){
    cl = hclust(dist(t(diff_fits[,go_short_list[[n]]])), method = "ward.D")
    res = rbind(res, t(diff_fits[,go_short_list[[n]]])[cl$order,])
  }
}
res = res[-1,]


pheatmap::pheatmap(res, cluster_cols = F, cluster_rows = F, fontsize_row = 8, gaps_row = cumsum(unlist(lapply(go_short_list, length))))
sig_resGO = sig_resGO[order(sig_resGO$elimKS, decreasing = F),]
go_med = matrix(0, 1, 100)
for(i in 1:nrow(sig_resGO)){
  g = unlist(strsplit(as.character(sig_resGO[i,"genes"]), ", "))
  if(length(g)>1){
    go_med = rbind(go_med, apply(t(diff_fits[,g]), 2, median))
    rownames(go_med)[nrow(go_med)] = sig_resGO[i,2]
  }
}
go_med = go_med[-1,]
eg = clusterProfiler::bitr(names(genes[fsel(genes)]), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg_all = clusterProfiler::bitr(names(genes), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")

x <- as.data.frame(ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                             readable=T, universe = eg_all$ENTREZID))
head(as.data.frame(x))

go_med = matrix(0, 1, 100)
for(i in 1:nrow(x)){
  g = unlist(strsplit(as.character(x[i,"geneID"]), "/"))
  if(length(g)>1){
    go_med = rbind(go_med, apply(t(diff_fits[,g]), 2, median))
    rownames(go_med)[nrow(go_med)] = x[i,2]
  }
}
go_med = go_med[-1,]

genes_reac = strsplit(x$geneID, "/")
names(genes_reac) = x$Description
genes_reac = reshape2::melt(genes_reac)
dat = genes_reac[!duplicated(genes_reac$value),]
dat = data.frame(row.names = dat[,1], "Reac" = dat[,2])
pheatmap::pheatmap(t(diff_fits)[diff_genes,], cluster_cols = F, 
                   clustering_method = "ward.D", fontsize_row = 8, annotation_row = dat)

hprof_re = list()
rprof_re = list()
for(l in 1:nrow(x)){
  genes_go = unlist(strsplit(as.character(x[l,"geneID"]), "/"))
  
  hprof_re[[x[l,2]]] = t(hep_fits$healthy[,genes_go])
  rprof_re[[x[l,2]]] = t(hep_fits$regenerating[,genes_go])
}


hmet = ggplot(reshape2::melt(t(apply(hprof_re$`Metallothioneins bind metals`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Healthy - Metallothioneins bind metals")
rmet = ggplot(reshape2::melt(t(apply(rprof_re$`Metallothioneins bind metals`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Regen - Metallothioneins bind metals")
rlip = ggplot(reshape2::melt(t(apply(rprof_re$`Plasma lipoprotein remodeling`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Regen - Plasma lipoprotein remodeling")
hlip = ggplot(reshape2::melt(t(apply(hprof_re$`Plasma lipoprotein remodeling`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Healthy - Plasma lipoprotein remodeling")
cowplot::plot_grid(hmet, rmet, hlip, rlip, ncol = 2, row = 2, align = "hv")









ego3 <- clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                               OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                               qvalueCutoff  = 0.05, 
                               pAdjustMethod = "BH", readable = T)


genes_ent = merge(eg_all, genes, by.x = 1, by.y = 0)
tmp = genes_ent[,2]
genes_ent = genes_ent[,3]
names(genes_ent) = tmp
ego4 <- clusterProfiler::gseGO(geneList = genes_ent[order(genes_ent, decreasing = T)], 
                               OrgDb = org.Hs.eg.db, ont = "BP", nPerm = 1000, minGSSize = 100,
                               maxGSSize = 500, pvalueCutoff = 0.05, verbose = FALSE)
plot_genes_fits = cbind(hep_fits$healthy[,c("SAA1", "HAMP", "CYP3A4", "GLUL", "CPS1", "ARG1")],
                        hep_fits$embolised[,c("SAA1", "HAMP", "CYP3A4", "GLUL", "CPS1", "ARG1")],
                        hep_fits$regenerating[,c("SAA1", "HAMP", "CYP3A4", "GLUL", "CPS1", "ARG1")])
plot_genes_fits = apply(plot_genes_fits, 2, c)

colnames(plot_genes_fits) = paste0(colnames(plot_genes_fits), 
                                   rep(c("_healthy", "_embolised", "_regenerating"), each = 6))
plot_genes_fits = reshape2::melt(plot_genes_fits)

plot_genes_fits$cond = unlist(lapply(strsplit(as.character(plot_genes_fits$Var2), "_"), function(x) x[2]))

plot_genes_fits$cond = factor(plot_genes_fits$cond, levels = c("healthy", "embolised", "regenerating"))

plot_genes_fits$genes = unlist(lapply(strsplit(as.character(plot_genes_fits$Var2), "_"), function(x) x[1]))

ggplot(plot_genes_fits, aes(x = Var1, y = value, colour = genes))+facet_wrap(~cond)+geom_line()
all_path = terms_groups$all[rpfilter(terms_groups$all),]
all_path = all_path[order(all_path$qvalue, decreasing = F),]

tab_go = reshape2::melt(lapply(df_cor$gene, function(g) all_path[which(grepl(g, all_path$geneID) & all_path$DB=="GO Term")[1],c("Description", "DB", "qvalue")][1,]))
tab_go$gene = df_cor$gene
tab_go = tab_go[,c(6,1,2,4)]
tab_go$de = apply(df_cor[,c(1,3,4,5)], 1, function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))])
tab_go$de[nchar(tab_go$de)>10] = NA
tab_go$de = unlist(tab_go$de)
tab_go$corr = df_cor$corr
tab_re = reshape2::melt(lapply(df_cor$gene, function(g) all_path[which(grepl(g, all_path$geneID) & all_path$DB=="Reactome")[1],c("Description", "DB", "qvalue")][1,]))
tab_re$gene = df_cor$gene
tab_re = tab_re[,c(6,1,2,4)]
tab_re$de = apply(df_cor[,c(1,3,4,5)], 1, function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))])
tab_re$de[nchar(tab_re$de)>10] = NA
tab_re$de = unlist(tab_re$de)
tab_re$corr = df_cor$corr

tab_all = rbind(tab_go, tab_re)
tab_all = tab_all[complete.cases(tab_all),]

xxx = table(tab_all$Description, tab_all$de)
xxx = xxx[rowSums(xxx)>2,]
xxx = xxx/rowSums(xxx)
View(apply(xxx, 1, function(x) ifelse(any(x>0.5), colnames(xxx)[x>0.5], NA)))

aaa = cbind(tapply(tab_all$corr, tab_all$Description, function(x) sum(x>=0.25)/length(x)), 
            tapply(tab_all$corr, tab_all$Description, function(x) length(x)),
            tapply(tab_all$de, tab_all$Description, function(x) names(table(x))[which.max(table(x))]))

Calculate distances for all significant genes

for(cond in names(qvals_list)){
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # dist
  if(file.exists(paste0("results/zonation_cond/dtw_distance_end_", cond, "_sigGenes.RDS"))){
    next
  } else{
    dd = dtw::dtwDist(scale_fit) # TAKES 4.5 HOURS
    saveRDS(dd, paste0("results/zonation_cond/dtw_distance_end_", cond, "_sigGenes.RDS"))
  }
}

Cluster genes (per condition)

top_n = 1000
for(cond in names(qvals_list)){
  print(cond)
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  
  dd = readRDS(paste0("results/zonation_cond/dtw_distance_end_", cond, "_sigGenes.RDS"))
  
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # select top genes (0 if using all)
  topgenes = if(top_n==0) names(gene_fit_p) else names(head(gene_fit_p[order(gene_fit_p,
                                                                             decreasing = F)],
                                                            top_n))
  
  clust_genes = hclust(as.dist(dd[topgenes,topgenes]), method = "ward.D2")
  
  cls = cutree(clust_genes, 4)
  #ape::plot.phylo(ape::as.phylo(clust_genes), tip.color = rainbow(10)[cls])
  plt_list = list()
  for(i in unique(cls)){
    sub_df = scale_fit[names(cls)[cls==i],]
    plot_df = reshape2::melt(sub_df)
    plt_list[[paste0("gene_",i)]] = ggplot(plot_df, aes(x = Var2, y = value, group = Var1))+
      geom_hline(yintercept = 0, linetype = "dashed")+
      geom_line(alpha = 0.1, colour = "grey69")+
      stat_summary(aes(y = value, group=1), fun=mean, colour="red", geom="line",group=1)+
      labs(title = paste0("gene_",i))+
      theme_bw()
  }
  
  pdf(paste0("results/zonation_cond/end_gene_expression_profile_", cond, ".pdf"),
      height = 4.5, width = 4.4)
  print(cowplot::plot_grid(plotlist = plt_list, nrow = 2, ncol = 2, align = "hv"))
  dev.off()
  
  df_pval_cl = merge(data.frame(gene_fit_p), data.frame(cls), by = 0, all = T)
  colnames(df_pval_cl) = c("gene", "pval", "cluster")
  
  write.csv(df_pval_cl, file = paste0("results/zonation_cond/end_pval_clust_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)

  write.csv(gene_fit_vals, 
            file = paste0("results/zonation_cond/end_gene_fits_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)
}

Calculate distances for all significant genes

for(cond in names(qvals_list)){
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # dist
  if(file.exists(paste0("results/zonation_cond/dtw_distance_", cond, "_sigGenes.RDS"))){
    next
  } else{
    dd = dtw::dtwDist(scale_fit) # TAKES 4.5 HOURS
    saveRDS(dd, paste0("results/zonation_cond/dtw_distance_", cond, "_sigGenes.RDS"))
  }
}

Cluster genes (per condition)

top_n = 1000
for(cond in names(qvals_list)){
  print(cond)
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  
  dd = readRDS(paste0("results/zonation_cond/dtw_distance_", cond, "_sigGenes.RDS"))
  
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # select top genes (0 if using all)
  topgenes = if(top_n==0) names(gene_fit_p) else names(head(gene_fit_p[order(gene_fit_p,
                                                                             decreasing = F)],
                                                            top_n))
  
  clust_genes = hclust(as.dist(dd[topgenes,topgenes]), method = "ward.D2")
  
  cls = cutree(clust_genes, 4)
  #ape::plot.phylo(ape::as.phylo(clust_genes), tip.color = rainbow(10)[cls])
  plt_list = list()
  for(i in unique(cls)){
    sub_df = scale_fit[names(cls)[cls==i],]
    plot_df = reshape2::melt(sub_df)
    plt_list[[paste0("gene_",i)]] = ggplot(plot_df, aes(x = Var2, y = value, group = Var1))+
      geom_hline(yintercept = 0, linetype = "dashed")+
      geom_line(alpha = 0.1, colour = "grey69")+
      stat_summary(aes(y = value, group=1), fun=mean, colour="red", geom="line",group=1)+
      labs(title = paste0("gene_",i))+
      theme_bw()
  }
  
  pdf(paste0("results/zonation_cond/hep_gene_expression_profile_", cond, ".pdf"),
      height = 4.5, width = 4.4)
  print(cowplot::plot_grid(plotlist = plt_list, nrow = 2, ncol = 2, align = "hv"))
  dev.off()
  
  df_pval_cl = merge(data.frame(gene_fit_p), data.frame(cls), by = 0, all = T)
  colnames(df_pval_cl) = c("gene", "pval", "cluster")
  
  write.csv(df_pval_cl, file = paste0("results/zonation_cond/hep_pval_clust_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)

  write.csv(gene_fit_vals, file = paste0("results/zonation_cond/hep_gene_fits_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)
}
---
title: "Zonation analysis V2"
output: html_notebook
---



# General Setup
Setup chunk

```{r, setup, include=FALSE}
knitr::opts_chunk$set(fig.width = 8)
knitr::opts_knit$set(root.dir = normalizePath(".."))
knitr::opts_knit$get("root.dir")
```

Setup reticulate

```{r}
library(reticulate)
knitr::knit_engines$set(python = reticulate::eng_python)
py_available(initialize = FALSE)
use_python(Sys.which("python"))
py_config()
```

Load libraries

```{r}
library(Seurat)
library(ggplot2)
library(ggridges)
library(dplyr)
library(destiny)
library(fdrtool)
library(gam)
library(gamclass)
library(dtw)
library(data.table)
library(clusterProfiler)
library(ReactomePA)
```

Function for cross-validation

```{r}
CVgam = function (formula, data, nfold = 10, debug.level = 0, method = "glm.fit", 
    printit = TRUE, cvparts = NULL, gamma = 1, seed = 29){
  # modified from the `gamclass` package to use gam::gam
    if (is.null(cvparts)) {
        set.seed(seed)
        cvparts <- sample(1:nfold, nrow(data), replace = TRUE)
    }
    folds <- unique(cvparts)
    khat <- hat <- numeric(nrow(data))
    scale.gam <- summary(gam::gam(formula, data = data, method = method))$scale
    for (i in folds) {
        trainrows <- cvparts != i
        testrows <- cvparts == i
        elev.gam <- gam::gam(formula, data = data[trainrows, ], method = method, 
            gamma = gamma)
        hat[testrows] <- predict(elev.gam, newdata = data[testrows, 
            ], select = TRUE)
        res <- residuals(elev.gam)
    }
    y <- eval(formula[[2]], envir = as.data.frame(data))
    res <- y - hat
    cvscale <- sum(res^2)/length(res)
    prntvec <- c(GAMscale = scale.gam, `CV-mse-GAM ` = cvscale)
    if (printit) 
        print(round(prntvec, 4))
    invisible(list(fitted = hat, resid = res, cvscale = cvscale, 
        scale.gam = scale.gam))
}
```



# Load data
Load data (from all cells)

```{r}
allcells_css = readRDS(file = "data/processed/allcells_css.RDS")
```



# Hepatocytes
Subset and process each condition

```{r}
hep_cells = list()
for(cond in unique(allcells_css@meta.data$Condition)){
  cat(cond)
  hep_cells[[cond]] = allcells_css[,allcells_css@meta.data$allcells_major=="Hepatocytes" &
                                     allcells_css@meta.data$Condition==cond]
  print(dim(hep_cells[[cond]]))
  hep_cells[[cond]] = suppressWarnings(SCTransform(hep_cells[[cond]], do.correct.umi = T, 
                                                   verbose = F, 
                                                   vars.to.regress=c("unique_name","nCount_RNA"),
                                                   variable.features.rv.th = 1, seed.use = 1,
                                                   return.only.var.genes = F, 
                                                   variable.features.n = NULL))
  hep_cells[[cond]] = RunPCA(hep_cells[[cond]], verbose = F)
  hep_cells[[cond]] = RunUMAP(hep_cells[[cond]], dims = 1:20, verbose = F)
}
```


## Healthy analysis
For healthy cells (which will serve as reference), do additional filtering and reprocessing

```{r}
DimPlot(hep_cells$healthy, reduction = "umap", group.by = "unique_name")
outcells = (hep_cells$healthy@reductions$umap@cell.embeddings[,1]>(1) &
              hep_cells$healthy@reductions$umap@cell.embeddings[,2]>5)
hep_cells$healthy = hep_cells$healthy[,!outcells]

hep_cells$healthy = suppressWarnings(SCTransform(hep_cells$healthy, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
hep_cells$healthy = RunPCA(hep_cells$healthy, verbose = F)
hep_cells$healthy = RunUMAP(hep_cells$healthy, dims = 1:15, verbose = F)
DimPlot(hep_cells$healthy, reduction = "umap", group.by = "unique_name")

hep_cells$healthy = FindNeighbors(hep_cells$healthy, reduction = "pca", dims = 1:15,
                                  force.recalc = T, graph.name = "hep")
hep_cells$healthy = FindClusters(hep_cells$healthy, graph.name = "hep", algorithm = 4, 
                                 resolution = seq(0.1,0.8,0.1), verbose = F)
DimPlot(hep_cells$healthy, reduction = "umap", group.by = "hep_res.0.3")
hep_cells$healthy = SetIdent(hep_cells$healthy, value = "hep_res.0.6")
mk_hep_h = FindAllMarkers(hep_cells$healthy, assay = "SCT")

saveRDS(hep_cells$healthy, file = "results/zonation_healthy/hep_healthy_srat.RDS")
```

Save hepatocytes

```{r}
saveRDS(hep_cells, file="results/zonation_cond/hep_cells.RDS")
```

Plot expression of zonation markers

```{r}
gfresh = c("CYP1A2", "CYP2E1", "GLUL", "CYP3A4", "G0S2", "APOC1", "UGT2B17", "BCHE", # central 
           "SAA1", "HAMP", "SAA2", "CPS1", "C3", "C1R", "MT-ND4", "MT-CO2") # portal

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_violin.png", 
    height = 1400, width = 1400)
VlnPlot(hep_cells$healthy, features = gfresh, group.by = "names_clusters", 
        pt.size = 0, sort = "increasing")
dev.off()
png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_umap.png", 
    height = 1400, width = 1400)
FeaturePlot(hep_cells$healthy, features = gfresh, pt.size = 0.6)
dev.off()
```

Obtain Diffusion maps projection

```{r}
set.seed(1)
dm = DiffusionMap(hep_cells$healthy@reductions$pca@cell.embeddings[,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm)
plot(dpt)
```

Check expression of some genes

```{r}
dpt_flat <- branch_divide(dpt, integer(0L))
root = min(dpt_flat@branch[, 1], na.rm = TRUE)
dpt_vals = destiny:::dpt_for_branch(dpt_flat, root)
df_dc = cbind(hep_cells$healthy@meta.data, 
              data.frame("DC1" = dpt@dm$DC1, "DC2" = dpt@dm$DC2, "DPT" = dpt_vals),
              Matrix::t(hep_cells$healthy@assays$SCT@data[gfresh,]))
colnames(df_dc) = gsub("-", ".", colnames(df_dc), fixed = T)

plt_list = list()
for(n in c(gsub("-", ".", gfresh, fixed = T), "names_clusters", "unique_name")){
  plt_list[[n]] = ggplot(df_dc, aes_string(x = "DC1", y = "DC2", colour = n))+
    geom_point()+
    theme_classic()
}

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_dc.png", 
    height = 900, width = 2800)
cowplot::plot_grid(plotlist = plt_list, nrow = 3, ncol = 6, align = "hv")
dev.off()
```

We'll be using DC1 as the pseudotime. This was the best approximation possible to the zonation gradient based on diffusion maps

```{r}
pdf("results/zonation_healthy/hep_distributions_dc1.pdf", useDingbats = F, 
    height = 4, width = 9)
plt1 = ggplot(df_dc, aes(x = DC1, y = names_clusters, 
                  group = names_clusters, fill = names_clusters))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

plt2 = ggplot(df_dc, aes(x = DC1, y = unique_name, 
                  group = unique_name, fill = unique_name))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

cowplot::plot_grid(plt1, plt2, ncol = 2, align = "h")
dev.off()
```

Find genes varying along the pseudotime

```{r}
# will only use variable genes
hvg = hep_cells$healthy@assays$SCT@var.features

# Fit GAM for each gene using pseudotime as independent variable.
t = df_dc$DC1
t = scales::rescale(rank(df_dc$DC1), c(0.00001, 0.99999))
gene_fit_p = c()
gene_fit_vals = data.frame(row.names = 1:100)
for(i in 1:length(hvg)){
  g = hvg[i]
  z = hep_cells$healthy@assays$SCT@data[g,]
  
  d = data.frame(z=z, t=t)
  tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
  
  # bins for model fitting
  bb = seq(min(t), max(t), length.out = 100)
  gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                               newdata = data.frame(t = bb)))
  p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
  gene_fit_p = c(gene_fit_p, p)
  names(gene_fit_p)[length(gene_fit_p)] = g
}

gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                              verbose=F, cutoff.method="pct0", pct0=0.9)$qval
```


## Analysis of other conditions
Now use the top genes to learn the pseudozonation coordinates and project the embolised and regenerating data

```{r}
sig_genes = gene_fit_p[gene_fit_p<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:1000])
top_sig_genes = top_sig_genes[top_sig_genes %in% rownames(hep_cells$embolised@assays$SCT@data) &
                                top_sig_genes %in%  rownames(hep_cells$regenerating@assays$SCT@data)]

data_gam_h = cbind(data.frame("DC1" = df_dc$DC1),
                   Matrix::t(hep_cells$health@assays$SCT@data[top_sig_genes,]))
#data_gam_h$DC1 = scales::rescale(-data_gam_h$DC1, c(0.00001, 0.99999))
data_gam_h$DC1 = scales::rescale(rank(data_gam_h$DC1), c(0.00001, 0.99999))
#bins_dc1 = cut(data_gam_h$DC1, 200)
#cells_use = unlist(tapply(1:length(bins_dc1), bins_dc1, function(x) sample(x, 12)))

gam_healthy = gam::gam(DC1 ~ ., data = data_gam_h, family = mgcv::betar(link = "logit", eps = 0.00001))
#gam_healthy = gam::gam(DC1 ~ ., data = data_gam_h)

hea_pred = predict(gam_healthy,
                   Matrix::t(hep_cells$healthy@assays$SCT@data[top_sig_genes,]), type = "response")
reg_pred = predict(gam_healthy,
                   Matrix::t(hep_cells$regenerating@assays$SCT@data[top_sig_genes,]), type = "response")
emb_pred = predict(gam_healthy,
                   Matrix::t(hep_cells$embolised@assays$SCT@data[top_sig_genes,]), type = "response")

plot(density(reg_pred), col = "salmon", ylim = c(0,3.25))
lines(density(emb_pred), col = "darkred")
lines(density(data_gam_h$DC1), col = "orange")
lines(density(hea_pred), col = "grey50")
legend(-0.073,32, legend = c("healthy (DC1)", "healthy (pred)", 
                         "embolised (pred)", "regenerating (pred)"),
       fill = c("green", "grey50", "red", "blue"))

plot_df = data.frame(vals = c(data_gam_h$DC1, emb_pred, reg_pred), 
                     Condition = c(rep("healthy", length(data_gam_h$DC1)), 
                                   rep("embolised", length(emb_pred)), 
                                   rep("regenerating", length(reg_pred))),
                     Donor = c(hep_cells$healthy$Donor, 
                               hep_cells$embolised$Donor, hep_cells$regenerating$Donor))
plot_df$bins100 = cut(plot_df$vals, 10)
plot_df$Condition = factor(plot_df$Condition, levels = c("healthy", "embolised", "regenerating"))
ggplot(plot_df, aes(x = bins100, fill = Condition))+
     geom_bar(position = "fill")+
     labs(x = "zonation (binned)", y = "proportion")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.x = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")
ggplot(plot_df, aes(x = bins100, fill = Condition))+
     facet_wrap(~Condition)+
     geom_bar()+
     labs(x = "zonation (binned)", y = "Number of cells")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     coord_flip()+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.y = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")
ggplot(plot_df, aes(x = vals, colour = Donor))+
     facet_wrap(~Condition)+
     geom_density()+
     labs(x = "zonation (binned)", y = "proportion")+
     theme_bw()+
     theme(legend.title.align = 0,
           legend.position = "bottom")

# data_gam_h$bins20 = cut(data_gam_h$DC1, 20)
# mean_bins = apply(data_gam_h[,2:501], 2, function(x) tapply(x, data_gam_h$bins20, mean))
# cor_reg_bins = cor(as.matrix(hep_cells$regenerating@assays$SCT@data[colnames(mean_bins),]), 
#                    t(mean_bins), method = "sp")
# max_cor_regen = apply(cor_reg_bins, 1, which.max)
# testdf = data.frame(bins = max_cor_regen, dons = hep_cells$regenerating$Donor)
# ggplot(testdf, aes(x = bins, fill = dons))+geom_bar()+facet_wrap(~dons, scales = "free_y")

reg_df = data.frame("pt" = reg_pred, "don" = hep_cells$regenerating$Donor)
reg_df = cbind(reg_df, Matrix::t(hep_cells$regenerating@assays$SCT@data[gfresh,]))
ggplot(reg_df, aes(x = pt, y = SAA1, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)+
  geom_hline(yintercept = 3)+
  geom_vline(xintercept = c(0,1))

ggplot(reg_df, aes(x = SAA2, y = GLUL, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)

traj_list = list("healthy" = data_gam_h$DC1, "embolised" = emb_pred, "regenerating" = reg_pred)
```

Calculate correlations

```{r}
cor_h = t(cor(traj_list$healthy, as.matrix(Matrix::t(hep_cells$healthy@assays$SCT@data)),
              method = "sp"))
cor_emb = t(cor(traj_list$embolised, as.matrix(Matrix::t(hep_cells$embolised@assays$SCT@data)),
                method = "sp"))
cor_reg = t(cor(traj_list$regenerating, as.matrix(Matrix::t(hep_cells$regenerating@assays$SCT@data)),
                method = "sp"))

l = list(cor_h, cor_emb, cor_reg)
all_corr = Reduce(function(x, y) merge(x,y, by = "rn", all = T), 
                  lapply(l, function(x) data.frame(x, rn = row.names(x))))
colnames(all_corr) = c("genes", "healthy", "embolised", "regenerating")

write.csv(all_corr, file = paste0("results/zonation_cond/hep_correlations_zonation_rank.csv"),
          col.names = T, row.names = F, quote = F)
```

Find varying genes - using all cells along the ranked pseudotime

```{r}
qvals_list = list()
fits_list = list()

# get HVG from all conditions
hvg = unique(c(hep_cells[["healthy"]]@assays$SCT@var.features,
               hep_cells[["embolised"]]@assays$SCT@var.features,
               hep_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(hep_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["regenerating"]]@assays$SCT@data)]

for(cond in names(hep_cells)){
  print(cond)
  # Fit GAM for each gene using pseudotime as independent variable.
  t = traj_list[[cond]]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = hep_cells[[cond]]@assays$SCT@data[g,]
    
    d = data.frame(z=z, t=t)
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_list[[cond]] = gene_fit_p
  fits_list[[cond]] = gene_fit_vals
}
```

Save trajectories and genes

```{r}
save(fits_list, qvals_list, traj_list, 
     file = "results/zonation_cond/hep_traj_qval_fits_rank.RData")
```

Find varying genes - normalised by bins along the ranked pseudotime

```{r}
qvals_b_list = list()
fits_b_list = list()

# get HVG from all conditions
hvg = unique(c(hep_cells[["healthy"]]@assays$SCT@var.features,
               hep_cells[["embolised"]]@assays$SCT@var.features,
               hep_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(hep_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["regenerating"]]@assays$SCT@data)]

for(cond in names(hep_cells)){
  print(cond)
  set.seed(1)
  # filtering the cells by bins
  bins = cut(traj_list[[cond]], 100)
  dons = hep_cells[[cond]]$Donor
  ndons = rowSums(table(bins, dons)>1)
  ncells = rowSums(table(bins, dons))
  sampleif = function(cells, bins, dons){
    res = c()
    for(d in unique(dons)){
      for(b in unique(bins)){
        x = cells[bins==b & dons==d]
        if(length(x)>=30){
          res = c(res, sample(x, 30, replace = F))
        } else if(length(x)>=1 & ndons[b]>=2 & ncells[b]>=10){
          res = c(res, x)
        } else{
          res = c(res, NULL)
        }
      }
    }
    return(res)
  }
  whichcells = sampleif(1:length(bins), bins, dons)
  
  # Fit GAM for each gene using pseudotime as independent variable.
  t = traj_list[[cond]][whichcells]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = hep_cells[[cond]]@assays$SCT@data[g,whichcells]
    
    d = data.frame(z=z, t=t)
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_b_list[[cond]] = gene_fit_p
  fits_b_list[[cond]] = gene_fit_vals
}
```

Save trajectories and genes

```{r}
save(fits_b_list, qvals_b_list, traj_list, 
     file = "results/zonation_cond/hep_binned_traj_qval_fits_rank.RData")
```

Find varying genes - using each donor as a replicate and calculating the average expression for each of 100 bins

```{r}
qvals_db_list = list()
fits_db_list = list()

# get HVG from all conditions
hvg = unique(c(hep_cells[["healthy"]]@assays$SCT@var.features,
               hep_cells[["embolised"]]@assays$SCT@var.features,
               hep_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(hep_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(hep_cells[["regenerating"]]@assays$SCT@data)]

meta_df = data.frame("pt" = c(traj_list$healthy, traj_list$embolised, traj_list$regenerating),
                     "donors" = c(hep_cells$healthy$Donor, hep_cells$embolised$Donor,
                                  hep_cells$regenerating$Donor),
                     "cond" = c(hep_cells$healthy$Condition, hep_cells$embolised$Condition,
                                  hep_cells$regenerating$Condition))
meta_df$bins = cut(meta_df$pt, 100)

for(cond in unique(meta_df$cond)){
  print(cond)
  set.seed(1)
  
  # Fit GAM for each gene
  t = 1:100
  sub_meta_df = meta_df[meta_df$cond==cond,]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = hep_cells[[cond]]@assays$SCT@data[g,]
    
    d = aggregate(z~bins+donors, data = cbind(sub_meta_df, z), FUN="mean")
    cc = data.frame(table(sub_meta_df$bins, as.factor(as.character(sub_meta_df$donors))))
    cc = cc[cc$Freq==0,]
    colnames(cc) = colnames(d)
    d = rbind(d, cc)
    d = d[order(d$bins, decreasing = F),]
    d$t = rep(t, each = length(unique(d$donors)))
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_db_list[[cond]] = gene_fit_p
  fits_db_list[[cond]] = gene_fit_vals
}
```

Save trajectories and genes

```{r}
save(fits_db_list, qvals_db_list, traj_list, 
     file = "results/zonation_cond/hep_binnedDonor_traj_qval_fits_rank.RData")
```

Plot cell distributions in pseudotime

```{r}
plot_df = data.frame("pseudotime" = c(traj_list$healthy, traj_list$embolised,
                                      traj_list$regenerating),
           "donor" = c(hep_cells$healthy@meta.data[,"Donor"],
                       hep_cells$embolised@meta.data[names(traj_list$embolised),"Donor"],
                       hep_cells$regenerating@meta.data[names(traj_list$regenerating),"Donor"]),
           "cond" = c(rep("healthy", length(traj_list$healthy)), 
                      rep("embolised", length(traj_list$embolised)),
                      rep("regenerating", length(traj_list$regenerating))))

pdf("results/zonation_cond/hep_cell_distributions_zonation.pdf", useDingbats = F, 
    width = 6, height = 5)
ggplot(plot_df, aes(x = donor, y = pseudotime, fill = cond))+
  geom_boxplot(notch = T)+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, group = donor, colour = donor))+
  facet_wrap(~cond)+
  geom_density()+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, colour = cond))+
  geom_density()+
  theme_bw()
dev.off()
```

Do cross validation on the healthy trajectory

```{r}
sig_genes = qvals_list$healthy[qvals_list$healthy<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:500]) 

data_gam_h = cbind(data.frame("DC1" = traj_list$healthy),
                   Matrix::t(hep_cells$health@assays$SCT@data[top_sig_genes,]))
colnames(data_gam_h) = gsub("-", "_", colnames(data_gam_h), fixed = T)
colnames(data_gam_h) = gsub(".", "_", colnames(data_gam_h), fixed = T)

cv_gam = CVgam(formula = DC1 ~ ., data = data_gam_h, nfold = 10, seed = 1, method = "glm.fit")

pdf("results/zonation_healthy/hep_original_fitted_pseudotime.pdf", useDingbats = F, 
    width = 6, height = 5)
plot(data_gam_h$DC1, cv_gam$fitted, xlab = "Healthy pseudotime (DC1)",ylab = "Predicted Healthy",
     main = paste0("Healthy original vs fitted pseudotime\nMSE: ", round(cv_gam$cvscale, 6)),
     pch = 19, cex = 0.5)
abline(0,1, col = "blue", lwd = 3)
dev.off()
```

Save zonation results

```{r}
# define the intervals based on healthy
df_h = data.table("zonation_pt" = traj_list$healthy)
df_h$zonation_int = cut(df_h$zonation_pt, 3)

df_e = data.table("zonation_pt" = traj_list$embolised)
df_e = df_h[df_e, on="zonation_pt", roll=Inf, rollends = T]

df_r = data.table("zonation_pt" = traj_list$regenerating)
df_r = df_h[df_r, on="zonation_pt", roll=Inf, rollends = T]

list_df = list("healthy" = df_h, "embolised" = df_e, "regenerating" = df_r)

for(n in names(traj_list)){
  ptdf = list_df[[n]]
  rownames(ptdf) = colnames(hep_cells[[n]])
  hep_cells[[n]] = AddMetaData(hep_cells[[n]], metadata = ptdf)
  write.csv(hep_cells[[n]]@meta.data[,c("unique_name", "Condition", 
                                        "zonation_pt", "zonation_int")],
            file = paste0("results/zonation_cond/hep_zonation_", n, "_rank.csv"),
            col.names = T, row.names = T, quote = F)
}

saveRDS(hep_cells, file = "results/zonation_cond/hep_cells_zonation_rank.RDS")
```



# Endothelial cells
Subset and process each condition (only LSEC)

```{r}
end_cells = list()
for(cond in unique(allcells_css@meta.data$Condition)){
  cat(cond)
  end_cells[[cond]] = allcells_css[,grepl("LSEC ", allcells_css@meta.data$allcells_clusters) &
                                     allcells_css@meta.data$Condition==cond]
  if(cond=="healthy"){
    end_cells[[cond]] = end_cells[[cond]][,grepl("LSEC ", end_cells[[cond]]@meta.data$names_clusters)]
  }
  print(dim(end_cells[[cond]]))
  end_cells[[cond]] = suppressWarnings(SCTransform(end_cells[[cond]], do.correct.umi = T, 
                                                   verbose = F, 
                                                   vars.to.regress=c("unique_name","nCount_RNA"),
                                                   variable.features.rv.th = 1, seed.use = 1,
                                                   return.only.var.genes = F, 
                                                   variable.features.n = NULL))
  end_cells[[cond]] = RunPCA(end_cells[[cond]], verbose = F)
  end_cells[[cond]] = RunUMAP(end_cells[[cond]], dims = 1:20, verbose = F)
}
```


## Healthy analysis
For healthy cells (which will serve as reference), do additional filtering and reprocessing

```{r}
DimPlot(end_cells$healthy, reduction = "umap", group.by = "allcells_clusters")
gfresh = c("CLEC1B", "CLEC4G", "LYVE1", "CD14",  # central 
           "PECAM1", "AQP1", "VWF", "CD34", # portal
           "ALB", "MKI67", "TRAC", "DCN") 
FeaturePlot(end_cells$healthy, features = gfresh, reduction = "umap")

outcells = (end_cells$healthy@reductions$umap@cell.embeddings[,1]<(-4)) # this removes what seem to be central hepatocytes
end_cells$healthy = end_cells$healthy[,!outcells]

end_cells$healthy = suppressWarnings(SCTransform(end_cells$healthy, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
end_cells$healthy = RunPCA(end_cells$healthy, verbose = F)
end_cells$healthy = RunUMAP(end_cells$healthy, dims = 1:15, verbose = F)
DimPlot(end_cells$healthy, reduction = "umap", group.by = "unique_name")

end_cells$healthy = FindNeighbors(end_cells$healthy, reduction = "pca", dims = 1:15,
                                  force.recalc = T, graph.name = "end")
end_cells$healthy = FindClusters(end_cells$healthy, graph.name = "end", algorithm = 2, 
                                 resolution = seq(0.1,0.8,0.1), verbose = F)
DimPlot(end_cells$healthy, reduction = "umap", group.by = "end_res.0.2")
end_cells$healthy = SetIdent(end_cells$healthy, value = "end_res.0.2")
mk_end_h = FindAllMarkers(end_cells$healthy, assay = "SCT")

saveRDS(end_cells$healthy, file = "results/zonation_healthy/end_healthy_srat.RDS")
```

Save endothelial cells

```{r}
saveRDS(end_cells, file="results/zonation_cond/end_cells.RDS")
```

Plot expression of zonation markers

```{r}
gfresh = c("CLEC1B", "CLEC4G", "LYVE1", "CD14",  # central 
           "PECAM1", "AQP1", "VWF", "CD34") # portal

png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_violin.png", 
    height = 1400, width = 1400)
VlnPlot(end_cells$healthy, features = gfresh, group.by = "names_clusters", 
        pt.size = 0, sort = "increasing")
dev.off()
png("results/zonation_healthy/healthy_hepatocyte_zonation_markers_umap.png", 
    height = 1400, width = 1400)
FeaturePlot(end_cells$healthy, features = gfresh, pt.size = 0.6)
dev.off()
```

Obtain Diffusion maps projection

```{r}
set.seed(1)
dm = DiffusionMap(end_cells$healthy@reductions$pca@cell.embeddings[,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm, tips = c(2382, 1379, 424))
plot(dpt, col_by = "DPT424")
plot(dpt, col_by = "branch")

dm = DiffusionMap(end_cells$healthy@reductions$pca@cell.embeddings[dpt$DPT424<15,1:4], 
                  rotate = T, n_eigs = 5)
dpt = DPT(dm)
plot(dpt, col_by = "DPT363")

end_cells$healthy = end_cells$healthy[,names(dpt@dm$DC1)]
```

Check expression of some genes

```{r}
df_dc = cbind(end_cells$healthy@meta.data, 
              data.frame("DC1" = dpt@dm$DC1, "DC2" = dpt@dm$DC2, "DPT" = dpt$DPT363),
              Matrix::t(end_cells$healthy@assays$SCT@data[gfresh,]))
colnames(df_dc) = gsub("-", ".", colnames(df_dc), fixed = T)
df_dc = cbind(df_dc, end_cells$healthy@reductions$umap@cell.embeddings)

plt_list = list()
for(n in c(gsub("-", ".", gfresh, fixed = T), "names_clusters", "unique_name")){
  plt_list[[n]] = ggplot(df_dc, aes_string(x = "DC1", y = "DC2", colour = n))+
    geom_point()+
    theme_classic()
}

png("results/zonation_healthy/healthy_endothelial_zonation_markers_dc.png", 
    height = 900, width = 2800)
cowplot::plot_grid(plotlist = plt_list, nrow = 4, ncol = 3, align = "hv")
dev.off()
```

We'll be using DPT363 as the pseudotime. This was the best approximation possible to the zonation gradient based on diffusion maps

```{r}
pdf("results/zonation_healthy/end_distributions_DPT363.pdf", useDingbats = F, 
    height = 4, width = 9)
plt1 = ggplot(df_dc, aes(x = DPT, y = names_clusters, 
                  group = names_clusters, fill = names_clusters))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

plt2 = ggplot(df_dc, aes(x = DPT, y = unique_name, 
                  group = unique_name, fill = unique_name))+
  geom_density_ridges2(alpha = 0.4)+
  theme_bw()+
  theme(legend.position = "none")

cowplot::plot_grid(plt1, plt2, ncol = 2, align = "h")
dev.off()
```

Find genes varying along the pseudotime

```{r}
# will only use variable genes
hvg = end_cells$healthy@assays$SCT@var.features

# Fit GAM for each gene using pseudotime as independent variable.
t = scales::rescale(rank(-df_dc$DPT), c(0.00001, 0.99999))
gene_fit_p = c()
gene_fit_vals = data.frame(row.names = 1:100)
for(i in 1:length(hvg)){
  g = hvg[i]
  z = end_cells$healthy@assays$SCT@data[g,]
  
  d = data.frame(z=z, t=t)
  tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
  
  # bins for model fitting
  bb = seq(min(t), max(t), length.out = 100)
  gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                               newdata = data.frame(t = bb)))
  p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
  gene_fit_p = c(gene_fit_p, p)
  names(gene_fit_p)[length(gene_fit_p)] = g
}

gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                              verbose=F, cutoff.method="pct0", pct0=0.9)$qval
```


## Analysis of other conditions
First, clean the data from the other two conditions

```{r}
end_cells$embolised = FindNeighbors(end_cells$embolised, reduction = "pca", dims = 1:20,
                                    force.recalc = T, graph.name = "endo")
end_cells$embolised = FindClusters(end_cells$embolised, graph.name = "endo", algorithm = 2, 
                                   resolution = 0.8, verbose = F)
DimPlot(end_cells$embolised, reduction = "umap", group.by = "endo_res.0.8")
end_cells$embolised = SetIdent(end_cells$embolised, value = "endo_res.0.8")
mk_emb = FindAllMarkers(end_cells$embolised, assay = "SCT")

outcells = (end_cells$embolised@reductions$umap@cell.embeddings[,2]>2.3 & 
              end_cells$embolised@reductions$umap@cell.embeddings[,1]<1) # this removes what seem to be central hepatocytes
end_cells$embolised = end_cells$embolised[,!outcells]
end_cells$embolised = end_cells$embolised[,end_cells$embolised@meta.data[,"endo_res.0.8"]!=12] # remove these extra stellate cells

end_cells$embolised = suppressWarnings(SCTransform(end_cells$embolised, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
end_cells$embolised = RunPCA(end_cells$embolised, verbose = F)
end_cells$embolised = RunUMAP(end_cells$embolised, dims = 1:15, verbose = F)
DimPlot(end_cells$embolised, reduction = "umap", group.by = "unique_name")


outcells = (end_cells$regenerating@reductions$umap@cell.embeddings[,2]>2.7 & 
              end_cells$regenerating@reductions$umap@cell.embeddings[,1]<(-1)) # this removes what seem to be central hepatocytes
end_cells$regenerating = end_cells$regenerating[,!outcells]

end_cells$regenerating = suppressWarnings(SCTransform(end_cells$regenerating, do.correct.umi = T, 
                                                 verbose = F,
                                                 vars.to.regress = c("unique_name","nCount_RNA"),
                                                 variable.features.rv.th = 1, seed.use = 1,
                                                 return.only.var.genes = F, 
                                                 variable.features.n = NULL))
end_cells$regenerating = RunPCA(end_cells$regenerating, verbose = F)
end_cells$regenerating = RunUMAP(end_cells$regenerating, dims = 1:15, verbose = F)
DimPlot(end_cells$regenerating, reduction = "umap", group.by = "unique_name")
```

Now use the top genes to learn the pseudozonation coordinates and project the embolised and regenerating data

```{r}
sig_genes = gene_fit_p[gene_fit_p<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:1000])
top_sig_genes = top_sig_genes[top_sig_genes %in% rownames(end_cells$embolised@assays$SCT@data) &
                                top_sig_genes %in%  rownames(end_cells$regenerating@assays$SCT@data)]

data_gam_h = cbind(data.frame("DPT" = df_dc$DPT),
                   Matrix::t(end_cells$healthy@assays$SCT@data[top_sig_genes,]))
data_gam_h$DPT = scales::rescale(rank(-data_gam_h$DPT), c(0.00001, 0.99999))

gam_healthy = gam::gam(DPT ~ ., data = data_gam_h, family = mgcv::betar(link = "logit", eps = 0.00001))

hea_pred = predict(gam_healthy,
                   Matrix::t(end_cells$health@assays$SCT@data[top_sig_genes,]), type= "response")
reg_pred = predict(gam_healthy,
                   Matrix::t(end_cells$regenerating@assays$SCT@data[top_sig_genes,]), type= "response")
emb_pred = predict(gam_healthy,
                   Matrix::t(end_cells$embolised@assays$SCT@data[top_sig_genes,]), type= "response")

plot(density(data_gam_h$DPT), col = "green", ylim = c(0,2.75))
lines(density(reg_pred), col = "blue")
lines(density(emb_pred), col = "red")
lines(density(hea_pred), col = "grey50")
legend(-0.073,32, legend = c("healthy (DC1)", "healthy (pred)", 
                         "embolised (pred)", "regenerating (pred)"),
       fill = c("green", "grey50", "red", "blue"))

plot_df = data.frame(vals = c(data_gam_h$DPT, emb_pred, reg_pred), 
                     Condition = c(rep("healthy", length(data_gam_h$DPT)), 
                                   rep("embolised", length(emb_pred)), 
                                   rep("regenerating", length(reg_pred))),
                     Donor = c(end_cells$healthy$Donor, 
                               end_cells$embolised$Donor, end_cells$regenerating$Donor),
                     ct = c(end_cells$healthy$allcells_simp, 
                               end_cells$embolised$allcells_simp, end_cells$regenerating$allcells_simp))
plot_df$bins100 = cut(plot_df$vals, 10)
plot_df$Condition = factor(plot_df$Condition, levels = c("healthy", "embolised", "regenerating"))
ggplot(plot_df, aes(x = bins100, fill = Condition))+
     geom_bar(position = "fill")+
     labs(x = "zonation (binned)", y = "proportion")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.x = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")
ggplot(plot_df, aes(x = bins100, fill = Condition))+
     facet_wrap(~Condition)+
     geom_bar()+
     labs(x = "zonation (binned)", y = "Number of cells")+
     scale_y_continuous(expand = c(0,0))+
     scale_fill_manual(values = colcond)+
     coord_flip()+
     guides(fill = guide_legend(title.position = "top"))+
     theme(axis.text.y = element_blank(),
           legend.title.align = 0,
           legend.position = "bottom")
ggplot(plot_df, aes(x = vals, colour = Donor))+
     facet_wrap(~Condition)+
     geom_density()+
     labs(x = "zonation (binned)", y = "proportion")+
     theme_bw()+
     theme(legend.title.align = 0,
           legend.position = "bottom")
ggplot(plot_df, aes(x = vals, colour = ct))+
     facet_wrap(~Condition)+
     geom_density()+
     labs(x = "zonation (binned)", y = "proportion")+
     theme_bw()+
     theme(legend.title.align = 0,
           legend.position = "bottom")

reg_df = data.frame("pt" = reg_pred, "don" = end_cells$regenerating$Donor)
reg_df = cbind(reg_df, Matrix::t(end_cells$regenerating@assays$SCT@data[gfresh,]))
ggplot(reg_df, aes(x = pt, y = CLEC1B, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)+
  geom_hline(yintercept = 3)+
  geom_vline(xintercept = c(0,1))

ggplot(reg_df, aes(x = CLEC1B, y = LYVE1, colour = don))+
  geom_point()+
  theme_classic()+
  facet_wrap(~don)

traj_list = list("healthy" = data_gam_h$DPT, "embolised" = emb_pred, "regenerating" = reg_pred)
```

Calculate correlations

```{r}
cor_h = t(cor(data_gam_h$DPT, as.matrix(Matrix::t(end_cells$healthy@assays$SCT@data)),
              method = "sp"))
cor_emb = t(cor(emb_pred, as.matrix(Matrix::t(end_cells$embolised@assays$SCT@data)),
                method = "sp"))
cor_reg = t(cor(reg_pred, as.matrix(Matrix::t(end_cells$regenerating@assays$SCT@data)),
                method = "sp"))

l = list(cor_h, cor_emb, cor_reg)
all_corr = Reduce(function(x, y) merge(x,y, by = "rn", all = T), 
                  lapply(l, function(x) data.frame(x, rn = row.names(x))))
colnames(all_corr) = c("genes", "healthy", "embolised", "regenerating")

write.csv(all_corr, file = paste0("results/zonation_cond/end_correlations_zonation_rank.csv"),
          col.names = T, row.names = F, quote = F)
```

Find varying genes - using all cells along the ranked pseudotime

```{r}
qvals_list = list()
fits_list = list()

# get HVG from all conditions
hvg = unique(c(end_cells[["healthy"]]@assays$SCT@var.features,
               end_cells[["embolised"]]@assays$SCT@var.features,
               end_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(end_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["regenerating"]]@assays$SCT@data)]

for(cond in names(end_cells)){
  print(cond)
  # Fit GAM for each gene using pseudotime as independent variable.
  t = traj_list[[cond]]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = end_cells[[cond]]@assays$SCT@data[g,]
    
    d = data.frame(z=z, t=t)
    tmp = suppressMessages(gam(z ~ ns(t, df = 4), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_list[[cond]] = gene_fit_p
  fits_list[[cond]] = gene_fit_vals
}
```

Save trajectories and genes

```{r}
save(fits_list, qvals_list, traj_list, 
     file = "results/zonation_cond/end_traj_qval_fits_rank.RData")
```

Find varying genes - normalised by bins along the ranked pseudotime

```{r}
qvals_b_list = list()
fits_b_list = list()

# get HVG from all conditions
hvg = unique(c(end_cells[["healthy"]]@assays$SCT@var.features,
               end_cells[["embolised"]]@assays$SCT@var.features,
               end_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(end_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["regenerating"]]@assays$SCT@data)]

for(cond in names(end_cells)){
  print(cond)
  set.seed(1)
  # filtering the cells by bins
  bins = cut(traj_list[[cond]], 100)
  dons = end_cells[[cond]]$Donor
  ndons = rowSums(table(bins, dons)>1)
  ncells = rowSums(table(bins, dons))
  sampleif = function(cells, bins, dons){
    res = c()
    for(d in unique(dons)){
      for(b in unique(bins)){
        x = cells[bins==b & dons==d]
        if(length(x)>=30){
          res = c(res, sample(x, 30, replace = F))
        } else if(length(x)>=1 & ndons[b]>=2 & ncells[b]>=10){
          res = c(res, x)
        } else{
          res = c(res, NULL)
        }
      }
    }
    return(res)
  }
  whichcells = sampleif(1:length(bins), bins, dons)
  
  # Fit GAM for each gene using pseudotime as independent variable.
  t = traj_list[[cond]][whichcells]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = end_cells[[cond]]@assays$SCT@data[g,whichcells]
    
    d = data.frame(z=z, t=t)
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_b_list[[cond]] = gene_fit_p
  fits_b_list[[cond]] = gene_fit_vals
}
```

Save trajectories and genes

```{r}
save(fits_b_list, qvals_b_list, traj_list, 
     file = "results/zonation_cond/end_binned_traj_qval_fits_rank.RData")
```

Find varying genes - using each donor as a replicate and calculating the average expression for each of 100 bins

```{r}
qvals_db_list = list()
fits_db_list = list()

# get HVG from all conditions
hvg = unique(c(end_cells[["healthy"]]@assays$SCT@var.features,
               end_cells[["embolised"]]@assays$SCT@var.features,
               end_cells[["regenerating"]]@assays$SCT@var.features))
hvg = hvg[hvg %in% rownames(end_cells[["healthy"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["embolised"]]@assays$SCT@data) &
            hvg %in% rownames(end_cells[["regenerating"]]@assays$SCT@data)]

meta_df = data.frame("pt" = c(traj_list$healthy, traj_list$embolised, traj_list$regenerating),
                     "donors" = c(end_cells$healthy$Donor, end_cells$embolised$Donor,
                                  end_cells$regenerating$Donor),
                     "cond" = c(end_cells$healthy$Condition, end_cells$embolised$Condition,
                                  end_cells$regenerating$Condition))
meta_df$bins = cut(meta_df$pt, 100)

for(cond in unique(meta_df$cond)){
  print(cond)
  set.seed(1)
  
  # Fit GAM for each gene
  t = 1:100
  sub_meta_df = meta_df[meta_df$cond==cond,]
  gene_fit_p = c()
  gene_fit_vals = data.frame(row.names = 1:100)
  for(i in 1:length(hvg)){
    g = hvg[i]
    z = end_cells[[cond]]@assays$SCT@data[g,]
    
    d = aggregate(z~bins+donors, data = cbind(sub_meta_df, z), FUN="mean")
    cc = data.frame(table(sub_meta_df$bins, as.factor(as.character(sub_meta_df$donors))))
    cc = cc[cc$Freq==0,]
    colnames(cc) = colnames(d)
    d = rbind(d, cc)
    d = d[order(d$bins, decreasing = F),]
    d$t = rep(t, each = length(unique(d$donors)))
    tmp = suppressMessages(gam(z ~ ns(t, df = 3), data=d))
    
    # bins for model fitting
    bb = seq(min(t), max(t), length.out = 100)
    gene_fit_vals[,g] = suppressMessages(predict(tmp,
                                                 newdata = data.frame(t = bb)))
    p = summary(tmp)$parametric.anova$`Pr(>F)`[1]
    gene_fit_p = c(gene_fit_p, p)
    names(gene_fit_p)[length(gene_fit_p)] = g
  }
  
  if(sum(is.na(gene_fit_p))>0){
    gene_fit_p[is.na(gene_fit_p)] = 1
  }
  gene_fit_p = fdrtool::fdrtool(gene_fit_p, statistic="pvalue", plot=F, 
                                verbose=F, cutoff.method="pct0", pct0=0.9)$qval
  
  qvals_db_list[[cond]] = gene_fit_p
  fits_db_list[[cond]] = gene_fit_vals
}
```

Save trajectories and genes

```{r}
save(fits_db_list, qvals_db_list, traj_list, 
     file = "results/zonation_cond/end_binnedDonor_traj_qval_fits_rank.RData")
```

Plot cell distributions in pseudotime

```{r}
plot_df = data.frame("pseudotime" = c(traj_list$healthy, traj_list$embolised,
                                      traj_list$regenerating),
           "donor" = c(end_cells$healthy@meta.data[,"Donor"],
                       end_cells$embolised@meta.data[names(traj_list$embolised),"Donor"],
                       end_cells$regenerating@meta.data[names(traj_list$regenerating),"Donor"]),
           "cond" = c(rep("healthy", length(traj_list$healthy)), 
                      rep("embolised", length(traj_list$embolised)),
                      rep("regenerating", length(traj_list$regenerating))))

pdf("results/zonation_cond/end_cell_distributions_zonation.pdf", useDingbats = F, 
    width = 6, height = 5)
ggplot(plot_df, aes(x = donor, y = pseudotime, fill = cond))+
  geom_boxplot(notch = T)+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, group = donor, colour = donor))+
  facet_wrap(~cond)+
  geom_density()+
  theme_bw()

ggplot(plot_df, aes(x = pseudotime, colour = cond))+
  geom_density()+
  theme_bw()
dev.off()
```

Do cross validation on the healthy trajectory

```{r}
sig_genes = qvals_b_list$healthy[qvals_b_list$healthy<=0.05]
top_sig_genes = names(sig_genes[order(sig_genes, decreasing = F)][1:1000]) 
# 500 in this case actually seems to be one of the models with the lowest MSE

data_gam_h = cbind(data.frame("DPT" = traj_list$healthy),
                   Matrix::t(end_cells$healthy@assays$SCT@data[top_sig_genes,]))
colnames(data_gam_h) = gsub("-", "_", colnames(data_gam_h), fixed = T)
colnames(data_gam_h) = gsub(".", "_", colnames(data_gam_h), fixed = T)

cv_gam = CVgam(data = data_gam_h, formula = DPT ~ ., nfold = 10, seed = 1, method = "glm.fit")

pdf("results/zonation_healthy/end_original_fitted_pseudotime.pdf", useDingbats = F, 
    width = 6, height = 5)
plot(data_gam_h$DPT, cv_gam$fitted, xlab = "Healthy pseudotime (DPT363)",ylab = "Predicted Healthy",
     main = paste0("Healthy original vs fitted pseudotime\nMSE: ", round(cv_gam$cvscale, 6)),
     pch = 19, cex = 0.5)
abline(0,1, col = "blue", lwd = 3)
dev.off()
```

Save zonation results

```{r}
# define the intervals based on healthy
df_h = data.table("zonation_pt" = traj_list$healthy)
df_h$zonation_int = cut(df_h$zonation_pt, 3)

df_e = data.table("zonation_pt" = traj_list$embolised)
df_e = df_h[df_e, on="zonation_pt", roll=Inf, rollends = T]

df_r = data.table("zonation_pt" = traj_list$regenerating)
df_r = df_h[df_r, on="zonation_pt", roll=Inf, rollends = T]

list_df = list("healthy" = df_h, "embolised" = df_e, "regenerating" = df_r)

for(n in names(traj_list)){
  ptdf = list_df[[n]]
  rownames(ptdf) = colnames(end_cells[[n]])
  end_cells[[n]] = AddMetaData(end_cells[[n]], metadata = ptdf)
  write.csv(end_cells[[n]]@meta.data[,c("unique_name", "Condition", 
                                        "zonation_pt", "zonation_int")],
            file = paste0("results/zonation_cond/end_zonation_", n, "_rank.csv"),
            col.names = T, row.names = T, quote = F)
}

saveRDS(end_cells, file = "results/zonation_cond/end_cells_zonation_rank.RDS")
```



# Comparing genes between conditions
Load data

```{r}
end_cells = readRDS(file = "results/zonation_cond/end_cells_zonation_rank.RDS")
hep_cells = readRDS(file = "results/zonation_cond/hep_cells_zonation_rank.RDS")
```

Load fits

```{r}
load("results/zonation_cond/hep_binnedDonor_traj_qval_fits_rank.RData")
hep_fits_list = fits_db_list
hep_qvals_list = qvals_db_list
hep_traj = traj_list
load("results/zonation_cond/end_binnedDonor_traj_qval_fits_rank.RData")
end_fits_list = fits_db_list
end_qvals_list = qvals_db_list
end_traj = traj_list
```

Calculate expression per bins

```{r}
nbins = 100

dfhep_h = data.table("zonation_pt" = hep_traj$healthy)
dfhep_h$zonation_int = cut(dfhep_h$zonation_pt, nbins)
dfhep_e = data.table("zonation_pt" = hep_traj$embolised)
dfhep_e = dfhep_h[dfhep_e, on="zonation_pt", roll=Inf, rollends = T]
dfhep_r = data.table("zonation_pt" = hep_traj$regenerating)
dfhep_r = dfhep_h[dfhep_r, on="zonation_pt", roll=Inf, rollends = T]

hep_mean_bins = list(
healthy = apply(hep_cells$healthy@assays$SCT@data, 1, 
                        function(x) tapply(x, dfhep_h$zonation_int, mean)),
embolised = apply(hep_cells$embolised@assays$SCT@data, 1, 
                        function(x) tapply(x, dfhep_e$zonation_int, mean)),
regenerating = apply(hep_cells$regenerating@assays$SCT@data, 1, 
                        function(x) tapply(x, dfhep_r$zonation_int, mean))
)
for(n in names(hep_mean_bins)){ hep_mean_bins[[n]][is.na(hep_mean_bins[[n]])] = 0 }
saveRDS(hep_mean_bins, file = "results/zonation_cond/zonation_bins_hep_list_rank.RDS")

dfend_h = data.table("zonation_pt" = end_traj$healthy)
dfend_h$zonation_int = cut(dfend_h$zonation_pt, nbins)
dfend_e = data.table("zonation_pt" = end_traj$embolised)
dfend_e = dfend_h[dfend_e, on="zonation_pt", roll=Inf, rollends = T]
dfend_r = data.table("zonation_pt" = end_traj$regenerating)
dfend_r = dfend_h[dfend_r, on="zonation_pt", roll=Inf, rollends = T]

end_mean_bins = list(
healthy = apply(end_cells$healthy@assays$SCT@data, 1, 
                        function(x) tapply(x, dfend_h$zonation_int, mean)),
embolised = apply(end_cells$embolised@assays$SCT@data, 1, 
                        function(x) tapply(x, dfend_e$zonation_int, mean)),
regenerating = apply(end_cells$regenerating@assays$SCT@data, 1, 
                        function(x) tapply(x, dfend_r$zonation_int, mean))
)
for(n in names(end_mean_bins)){ end_mean_bins[[n]][is.na(end_mean_bins[[n]])] = 0 }
saveRDS(end_mean_bins, file = "results/zonation_cond/zonation_bins_end_list_rank.RDS")
```

Correlate binned expression

```{r}
comb_cond = combn(names(end_cells), 2)
hep_cor_list = list()
for(i in 1:ncol(comb_cond)){
  c1 = comb_cond[1,i]
  c2 = comb_cond[2,i]
  
  genes = intersect(colnames(hep_mean_bins[[c1]]),colnames(hep_mean_bins[[c2]]))
  
  for(g in genes){
    hep_cor_list[[paste0(c1,"_",c2)]][[g]] = cor(cbind(hep_mean_bins[[c1]][,g],
                                                       hep_mean_bins[[c2]][,g]),
                                                 method = "sp")[1,2]
  }
}
hep_cor_list = lapply(hep_cor_list, data.frame)
hep_cor_list = lapply(hep_cor_list, function(x) cbind(rownames(x), x))

end_cor_list = list()
for(i in 1:ncol(comb_cond)){
  c1 = comb_cond[1,i]
  c2 = comb_cond[2,i]
  
  genes = intersect(colnames(end_mean_bins[[c1]]),colnames(end_mean_bins[[c2]]))
  
  for(g in genes){
    end_cor_list[[paste0(c1,"_",c2)]][[g]] = cor(cbind(end_mean_bins[[c1]][,g],
                                                       end_mean_bins[[c2]][,g]),
                                                 method = "sp")[1,2]
  }
}
end_cor_list = lapply(end_cor_list, data.frame)
end_cor_list = lapply(end_cor_list, function(x) cbind(rownames(x), x))
```

Correlate fitted expression

```{r}
comb_cond = combn(names(end_cells), 2)
hep_cor_fit_list = list()
for(i in 1:ncol(comb_cond)){
  c1 = comb_cond[1,i]
  c2 = comb_cond[2,i]
  
  genes = intersect(colnames(hep_fits_list[[c1]]),colnames(hep_fits_list[[c2]]))
  
  for(g in genes){
    hep_cor_fit_list[[paste0(c1,"_",c2)]][[g]] = cor(cbind(hep_fits_list[[c1]][,g],
                                                       hep_fits_list[[c2]][,g]),
                                                 method = "sp")[1,2]
  }
}
hep_cor_fit_list = lapply(hep_cor_fit_list, data.frame)
hep_cor_fit_list = lapply(hep_cor_fit_list, function(x) cbind(rownames(x), x))

end_cor_fit_list = list()
for(i in 1:ncol(comb_cond)){
  c1 = comb_cond[1,i]
  c2 = comb_cond[2,i]
  
  genes = intersect(colnames(end_fits_list[[c1]]),colnames(end_fits_list[[c2]]))
  
  for(g in genes){
    end_cor_fit_list[[paste0(c1,"_",c2)]][[g]] = cor(cbind(end_fits_list[[c1]][,g],
                                                       end_fits_list[[c2]][,g]),
                                                 method = "sp")[1,2]
  }
}
end_cor_fit_list = lapply(end_cor_fit_list, data.frame)
end_cor_fit_list = lapply(end_cor_fit_list, function(x) cbind(rownames(x), x))
```

Save correlations

```{r}
save(hep_cor_list, end_cor_list, 
     file = "results/zonation_cond/zonation_cor_cond_lists_rank.RData")
```

Filter pathways with ribosomal genes

```{r}
rpfilter = function(x){
  return(!grepl("^RPL", x$geneID) & !grepl("^RPS", x$geneID) & 
           !grepl("/RPL", x$geneID, fixed = T) & !grepl("/RPS", x$geneID, fixed = T))
}
```


## Hepatocytes
Get the top genes for each condition

```{r}
# get signifficant genes showing variation (at least 0.1 range in the fitted values)
genes_list = list()
for(n in names(hep_fits_list)){
  h_fc_sp = apply(hep_fits_list[[n]], 2, function(x) diff(range(x)))
  genes_c = names(hep_qvals_list[[n]])[hep_qvals_list[[n]]<=0.05 &
                    names(hep_qvals_list[[n]]) %in% names(h_fc_sp)[h_fc_sp>=0.1]]
  genes_c = genes_c[genes_c %in% names(hep_qvals_list[[n]])[order(hep_qvals_list[[n]], decreasing = F)]]
  genes_list[[n]] = genes_c
}
```

Find enriched GO Terms and Reactome pathways

```{r}
comb_cond = combn(names(hep_fits_list), 2)
all_genes_conds = list()
terms_groups_list = list()
df_cor_list = list()
for(i in 1:ncol(comb_cond)){
  comp = paste0(comb_cond[1,i],"_", comb_cond[2,i])
  print(comb_cond[,i])
  genes_conds = list("common" = intersect(genes_list[[comb_cond[1,i]]], genes_list[[comb_cond[2,i]]]))
  genes_conds[[comb_cond[1,i]]] = setdiff(genes_list[[comb_cond[1,i]]], genes_list[[comb_cond[2,i]]])
  genes_conds[[comb_cond[2,i]]] = setdiff(genes_list[[comb_cond[2,i]]], genes_list[[comb_cond[1,i]]])
  all_genes_conds[[comp]] = genes_conds
  
  # put the genes in a data frame
  df_cor = hep_cor_list[[comp]]
  df_cor[,comb_cond[1,i]] = df_cor$`rownames(x)` %in% genes_conds[[comb_cond[1,i]]]
  df_cor[,comb_cond[2,i]]  = df_cor$`rownames(x)` %in% genes_conds[[comb_cond[2,i]]]
  df_cor$common = df_cor$`rownames(x)` %in% genes_conds$common
  colnames(df_cor)[1:2] = c("gene", "corr")
  
  # lists of gene groups to compare - most important is "all"
  gene_groups_list = list(common = as.character(df_cor$gene[df_cor$common]),
                          all = as.character(df_cor$gene[(df_cor[[comb_cond[2,i]]] |
                                                            df_cor[[comb_cond[1,i]]] |
                                                            df_cor$common)]))
  gene_groups_list[[comb_cond[1,i]]] = as.character(df_cor$gene[df_cor[[comb_cond[1,i]]]])
  gene_groups_list[[comb_cond[2,i]]] = as.character(df_cor$gene[df_cor[[comb_cond[2,i]]]])
  
  # get enriched go terms/pathways
  terms_groups = list()
  for(n in names(gene_groups_list)){
    # convert gene IDs
    eg = clusterProfiler::bitr(gene_groups_list[[n]], fromType="SYMBOL", 
                               toType="ENTREZID", OrgDb="org.Hs.eg.db")
    eg_all = clusterProfiler::bitr(as.character(df_cor$gene), fromType="SYMBOL", 
                                   toType="ENTREZID", OrgDb="org.Hs.eg.db")
    
    # enrichment
    reac = ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                     readable=T, universe = eg_all$ENTREZID)
    got = clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                    OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                    qvalueCutoff  = 0.05, 
                                    pAdjustMethod = "BH", readable = T)
    
    all_terms = rbind(as.data.frame(reac), as.data.frame(got))
    all_terms$DB = c(rep("Reactome", nrow(as.data.frame(reac))), 
                     rep("GO Term", nrow(as.data.frame(got))))
    all_terms = all_terms[all_terms$qvalue<=0.05,]
    
    terms_groups[[n]] = all_terms
  }
  terms_groups_list[[comp]] = terms_groups
  df_cor_list[[comp]] = df_cor
}
```

Identify pathways more affected by one condition

```{r}
anot_list = list()
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  df_cor = df_cor_list[[n]]
  
  # get the pathways for all genes, minus those with ribosomal genes
  all_path = terms_groups$all[rpfilter(terms_groups$all),]
  all_path = all_path[order(all_path$qvalue, decreasing = F),]
  
  # get gene groups as a data frame
  g_g = unlist(apply(df_cor[,c(1,3,4,5)], 1, 
                     function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))]))
  gene_group = data.frame("gene" = names(g_g),
                          "group" = g_g)
  
  # get fraction of genes from a condition that are present in a give pathway
  n_g_group = list()
  n_g_group[[levels(gene_group$group)[1]]] = c()
  n_g_group[[levels(gene_group$group)[2]]] = c()
  n_g_group[[levels(gene_group$group)[3]]] = c()
  for(i in 1:nrow(all_path)){
    genes_p = unlist(strsplit(all_path$geneID[i], "/"))
    for(gg in unique(gene_group$group)){
      ngg = sum(genes_p %in% gene_group$gene[gene_group$group==gg])/sum(gene_group$group==gg)
      n_g_group[[gg]] = c(n_g_group[[gg]], ngg)
    }
  }
  all_path = cbind(all_path, data.frame(n_g_group))
  all_path$diff = all_path[,13] - all_path[,12] # add difference between cond and healthy
  all_path$highest = colnames(all_path)[11:13][apply(all_path[,11:13], 1, which.max)]
  
  # group the detected pathways by similarity
  go_path = all_path[all_path$DB=="GO Term",]
  mat_path = matrix(0, nrow(go_path), nrow(go_path))
  for(i in 1:nrow(go_path)){
    g1 = go_path[i,"geneID"]
    g1 = unlist(strsplit(g1, "/"))
    for(j in 1:nrow(go_path)){
        g2 = go_path[j,"geneID"]
        g2 = unlist(strsplit(g2, "/"))
        mat_path[i,j] = length(intersect(g1, g2))/mean(c(length(g1), length(g2)))
    }
  }
  cl_go = hclust(dist(mat_path), method = "ward.D2")
  cl_go = cutree(cl_go, 7)
  rownames(mat_path) = go_path$Description
  anot = data.frame(row.names = go_path$Description, 
                    "cl" = paste0("cl",cl_go), 
                    "n" = go_path$Count,
                    "diff" = go_path$diff,
                    "qval" = -log10(go_path$qvalue),
                    "term" = go_path$Description)
  pheatmap::pheatmap(mat_path, clustering_method = "ward.D2", 
                     annotation_row = anot[,-5], show_rownames = F)
  genes_per_gocl = tapply(anot$term, anot$cl, 
         function(x) unique(unlist(strsplit(go_path[go_path$Description %in% x,"geneID"], "/"))))
  genes_cond_gocl = lapply(genes_per_gocl, function(x) lapply(genes_conds, function(y) sum(y %in% x)))
  genes_cond_gocl = Reduce(rbind, genes_cond_gocl)
  rownames(genes_cond_gocl) = paste0("cl", 1:7)
  
  anot_list[[n]] = anot
}
```

Plot top terms

```{r}
pdf("results/zonation_cond/hep_top_go_terms_genes.pdf", 
    height = 6, width = 7, useDingbats = F)
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  
  for(cond in names(terms_groups)){
    plot_df = terms_groups[[cond]][rpfilter(terms_groups[[cond]]),]
    plot_df = plot_df[order(plot_df$qvalue, decreasing = F),]
    plot_df$Description = factor(plot_df$Description, levels = rev(plot_df$Description))
    
    plt = ggplot(plot_df[1:10,], aes(x = Description, y = -log10(qvalue)))+
      geom_bar(stat = "identity")+
      coord_flip()+
      labs(title = paste0(n, ": ", cond))+
      theme_bw()
    print(plt)
  }
}
dev.off()
```

Plot some genes

```{r}
# plot all genes from pathway in grey, average in colour for cond and healthy
plot_list = list()
exp_plt_list = list()
for(comp in names(terms_groups_list)){
  print(comp)
  c1 = strsplit(comp, "_")[[1]][1]
  c2 = strsplit(comp, "_")[[1]][2]
  
  plot_list[[comp]] = list()
  exp_plt_list[[comp]] = list()
  for(pw in c(c1, c2)){
    goterms = terms_groups_list[[comp]][[pw]]
    goterms = goterms[goterms$DB=="GO Term" & rpfilter(goterms),]
    
    exp_plt_list[[comp]][[pw]] = list()
    plot_list[[comp]][[pw]] = list()
    for(go in 1:nrow(goterms)){
      genes_path = unlist(strsplit(goterms[go,"geneID"],"/"))
      
      exp_healthy = reshape2::melt(hep_fits_list[[c1]][,genes_path])
      exp_healthy$xx = rep(1:100, length(genes_path))
      exp_healthy$cond = c1
      
      exp_cond = reshape2::melt(hep_fits_list[[c2]][,genes_path])
      exp_cond$xx = rep(1:100, length(genes_path))
      exp_cond$cond = c2

      exp_df = rbind(exp_healthy, exp_cond)

      plt = ggplot(exp_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
        #geom_line(mapping = aes(colour = cond), alpha = 0.2)+
        stat_smooth(mapping = aes(group = cond, colour = cond), show.legend = T)+
        labs(title = goterms[go,"Description"], 
             subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"))+
        theme_classic()
      
      plot_list[[comp]][[pw]][[goterms[go,"Description"]]] = plt
      exp_plt_list[[comp]][[pw]][[goterms[go,"Description"]]] = exp_df
    }
  }
}
```


```{r, fig.height=2.7, fig.width=4}
pathway = "negative regulation of apoptotic signaling pathway"
unique(exp_plt_list$healthy_regenerating$regenerating[[pathway]]$variable)
gn = "ICAM1"
plot_df = exp_plt_list$healthy_regenerating$regenerating[[pathway]]
plot_df = plot_df[plot_df$variable==gn,]
c1 = "healthy"
c2 = "regenerating"
pw = "regenerating"
ggplot(plot_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
    geom_line(mapping = aes(colour = cond))+
    labs(title = pathway, subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"), 
         y = paste0("log(norm exp) ", gn))+
    theme_classic()
```

Save important lists

```{r}
save(terms_groups_list, anot_list, df_cor_list, genes_list, plot_list, exp_plt_list,
     file = "results/zonation_cond/hep_go_results_rank.RData")
```


## Endothelial cells
Get the top genes for each condition

```{r}
# get signifficant genes showing variation (at least 0.1 range in the fitted values)
genes_list = list()
for(n in names(end_fits_list)){
  h_fc_sp = apply(end_fits_list[[n]], 2, function(x) diff(range(x)))
  genes_c = names(end_qvals_list[[n]])[end_qvals_list[[n]]<=0.05 &
                    names(end_qvals_list[[n]]) %in% names(h_fc_sp)[h_fc_sp>=0.1]]
  genes_c = genes_c[genes_c %in% names(end_qvals_list[[n]])[order(end_qvals_list[[n]], decreasing = F)]]
  genes_list[[n]] = genes_c
}
```

Find enriched GO Terms and Reactome pathways

```{r}
comb_cond = combn(names(end_fits_list), 2)
all_genes_conds = list()
terms_groups_list = list()
df_cor_list = list()
for(i in 1:ncol(comb_cond)){
  comp = paste0(comb_cond[1,i],"_", comb_cond[2,i])
  print(comb_cond[,i])
  genes_conds = list("common" = intersect(genes_list[[comb_cond[1,i]]], genes_list[[comb_cond[2,i]]]))
  genes_conds[[comb_cond[1,i]]] = setdiff(genes_list[[comb_cond[1,i]]], genes_list[[comb_cond[2,i]]])
  genes_conds[[comb_cond[2,i]]] = setdiff(genes_list[[comb_cond[2,i]]], genes_list[[comb_cond[1,i]]])
  all_genes_conds[[comp]] = genes_conds
  
  # put the genes in a data frame
  df_cor = end_cor_fit_list[[comp]]
  df_cor[,comb_cond[1,i]] = df_cor$`rownames(x)` %in% genes_conds[[comb_cond[1,i]]]
  df_cor[,comb_cond[2,i]]  = df_cor$`rownames(x)` %in% genes_conds[[comb_cond[2,i]]]
  df_cor$common = df_cor$`rownames(x)` %in% genes_conds$common
  colnames(df_cor)[1:2] = c("gene", "corr")
  
  # lists of gene groups to compare - most important is "all"
  gene_groups_list = list(common = as.character(df_cor$gene[df_cor$common]),
                          all = as.character(df_cor$gene[(df_cor[[comb_cond[2,i]]] |
                                                            df_cor[[comb_cond[1,i]]] |
                                                            df_cor$common)]))
  gene_groups_list[[comb_cond[1,i]]] = as.character(df_cor$gene[df_cor[[comb_cond[1,i]]]])
  gene_groups_list[[comb_cond[2,i]]] = as.character(df_cor$gene[df_cor[[comb_cond[2,i]]]])
  gene_groups_list[[paste0(comb_cond[1,i], "_all")]] = c(gene_groups_list$common,
                                                         as.character(df_cor$gene[df_cor[[comb_cond[1,i]]]]))
  gene_groups_list[[paste0(comb_cond[2,i], "_all")]] = c(gene_groups_list$common,
                                                         as.character(df_cor$gene[df_cor[[comb_cond[2,i]]]]))
  
  # get enriched go terms/pathways
  terms_groups = list()
  eg_all = clusterProfiler::bitr(as.character(df_cor$gene), fromType="SYMBOL", 
                                   toType="ENTREZID", OrgDb="org.Hs.eg.db")
  for(n in names(gene_groups_list)){
    # convert gene IDs
    eg = clusterProfiler::bitr(gene_groups_list[[n]], fromType="SYMBOL", 
                               toType="ENTREZID", OrgDb="org.Hs.eg.db")
    
    # enrichment
    reac = ReactomePA::enrichPathway(gene = eg$ENTREZID, pvalueCutoff=0.05, 
                                     readable=T, universe = eg_all$ENTREZID)
    got = clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                    OrgDb = org.Hs.eg.db, ont = "BP",
                                    pvalueCutoff  = 0.01, qvalueCutoff  = 0.05, 
                                    pAdjustMethod = "BH", readable = T)
    
    all_terms = rbind(as.data.frame(reac), as.data.frame(got))
    all_terms$DB = c(rep("Reactome", nrow(as.data.frame(reac))), 
                     rep("GO Term", nrow(as.data.frame(got))))
    all_terms = all_terms[all_terms$qvalue<=0.05,]
    
    terms_groups[[n]] = all_terms
  }
  terms_groups_list[[comp]] = terms_groups
  df_cor_list[[comp]] = df_cor
}
```

Identify pathways more affected by one condition

```{r}
anot_list = list()
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  df_cor = df_cor_list[[n]]
  
  # get the pathways for all genes, minus those with ribosomal genes
  all_path = terms_groups$all[rpfilter(terms_groups$all),]
  all_path = all_path[order(all_path$qvalue, decreasing = F),]
  
  # get gene groups as a data frame
  g_g = unlist(apply(df_cor[,c(1,3,4,5)], 1, 
                     function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))]))
  gene_group = data.frame("gene" = names(g_g),
                          "group" = g_g)
  
  # get fraction of genes from a condition that are present in a give pathway
  n_g_group = list()
  n_g_group[[levels(gene_group$group)[1]]] = c()
  n_g_group[[levels(gene_group$group)[2]]] = c()
  n_g_group[[levels(gene_group$group)[3]]] = c()
  for(i in 1:nrow(all_path)){
    genes_p = unlist(strsplit(all_path$geneID[i], "/"))
    for(gg in unique(gene_group$group)){
      ngg = sum(genes_p %in% gene_group$gene[gene_group$group==gg])/sum(gene_group$group==gg)
      n_g_group[[gg]] = c(n_g_group[[gg]], ngg)
    }
  }
  all_path = cbind(all_path, data.frame(n_g_group))
  all_path$diff = all_path[,13] - all_path[,12] # add difference between cond and healthy
  all_path$highest = colnames(all_path)[11:13][apply(all_path[,11:13], 1, which.max)]
  
  # group the detected pathways by similarity
  go_path = all_path[all_path$DB=="GO Term",]
  mat_path = matrix(0, nrow(go_path), nrow(go_path))
  for(i in 1:nrow(go_path)){
    g1 = go_path[i,"geneID"]
    g1 = unlist(strsplit(g1, "/"))
    for(j in 1:nrow(go_path)){
        g2 = go_path[j,"geneID"]
        g2 = unlist(strsplit(g2, "/"))
        mat_path[i,j] = length(intersect(g1, g2))/mean(c(length(g1), length(g2)))
    }
  }
  cl_go = hclust(dist(mat_path), method = "ward.D2")
  cl_go = cutree(cl_go, 7)
  rownames(mat_path) = go_path$Description
  anot = data.frame(row.names = go_path$Description, 
                    "cl" = paste0("cl",cl_go), 
                    "n" = go_path$Count,
                    "diff" = go_path$diff,
                    "qval" = -log10(go_path$qvalue),
                    "term" = go_path$Description)
  pheatmap::pheatmap(mat_path, clustering_method = "ward.D2", 
                     annotation_row = anot[,-5], show_rownames = F)
  genes_per_gocl = tapply(anot$term, anot$cl, 
         function(x) unique(unlist(strsplit(go_path[go_path$Description %in% x,"geneID"], "/"))))
  genes_cond_gocl = lapply(genes_per_gocl, function(x) lapply(genes_conds, function(y) sum(y %in% x)))
  genes_cond_gocl = Reduce(rbind, genes_cond_gocl)
  rownames(genes_cond_gocl) = paste0("cl", 1:7)
  
  anot_list[[n]] = anot
}
```

Plot top terms

```{r}
pdf("results/zonation_cond/end_top_go_terms_genes.pdf", 
    height = 6, width = 7, useDingbats = F)
for(n in names(terms_groups_list)){
  terms_groups = terms_groups_list[[n]]
  
  for(cond in names(terms_groups)){
    plot_df = terms_groups[[cond]][rpfilter(terms_groups[[cond]]),]
    plot_df = plot_df[order(plot_df$qvalue, decreasing = F),]
    plot_df$Description = factor(plot_df$Description, levels = rev(plot_df$Description))
    
    plt = ggplot(plot_df[1:10,], aes(x = Description, y = -log10(qvalue)))+
      geom_bar(stat = "identity")+
      coord_flip()+
      labs(title = paste0(n, ": ", cond))+
      theme_bw()
    print(plt)
  }
}
dev.off()
```

Plot some genes

```{r}
# plot all genes from pathway in grey, average in colour for cond and healthy
plot_list = list()
exp_plt_list = list()
for(comp in names(terms_groups_list)){
  print(comp)
  c1 = strsplit(comp, "_")[[1]][1]
  c2 = strsplit(comp, "_")[[1]][2]
  
  plot_list[[comp]] = list()
  exp_plt_list[[comp]] = list()
  for(pw in c(c1, c2)){
    goterms = terms_groups_list[[comp]][[pw]]
    goterms = goterms[goterms$DB=="GO Term" & rpfilter(goterms),]
    
    exp_plt_list[[comp]][[pw]] = list()
    plot_list[[comp]][[pw]] = list()
    if(nrow(goterms)>0){
      for(go in 1:nrow(goterms)){
        genes_path = unlist(strsplit(goterms[go,"geneID"],"/"))
        
        exp_healthy = reshape2::melt(end_fits_list[[c1]][,genes_path])
        exp_healthy$xx = rep(1:100, length(genes_path))
        exp_healthy$cond = c1
        
        exp_cond = reshape2::melt(end_fits_list[[c2]][,genes_path])
        exp_cond$xx = rep(1:100, length(genes_path))
        exp_cond$cond = c2
  
        exp_df = rbind(exp_healthy, exp_cond)
  
        plt = ggplot(exp_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
          #geom_line(mapping = aes(colour = cond), alpha = 0.2)+
          stat_smooth(mapping = aes(group = cond, colour = cond), show.legend = T)+
          labs(title = goterms[go,"Description"], 
               subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"))+
          theme_classic()
        
        plot_list[[comp]][[pw]][[goterms[go,"Description"]]] = plt
        exp_plt_list[[comp]][[pw]][[goterms[go,"Description"]]] = exp_df
      }
    }
  }
}
```


```{r, fig.height=2.7, fig.width=4}
pathway = "cellular response to type I interferon"
unique(exp_plt_list$healthy_regenerating$regenerating[[pathway]]$variable)
gn = "EGR1"
plot_df = exp_plt_list$healthy_regenerating$regenerating[[pathway]]
plot_df = plot_df[plot_df$variable==gn,]
c1 = "healthy"
c2 = "regenerating"
pw = "regenerating"
ggplot(plot_df, aes(x = xx, y = value, group = interaction(variable, cond)))+
    geom_line(mapping = aes(colour = cond))+
    labs(title = pathway, subtitle = paste0(c1, " vs ", c2, ", ", pw, " pathways"), 
         y = paste0("log(norm exp) ", gn))+
    theme_classic()
```

Save important lists

```{r}
save(terms_groups_list, anot_list, df_cor_list, plot_list, exp_plt_list,
     file = "results/zonation_cond/end_go_results_rank.RData")
```













# OLD CODE
Find genes with differencial variation - not very interesting since most genes seem to be showing a variation

```{r}
cells = unlist(lapply(hep_cells, colnames))
genes = unique(unlist(lapply(hep_cells, VariableFeatures)))
exp_hep = allcells_css[genes,cells]@assays$SCT@data
genes = rownames(exp_hep)

df_meta = data.frame("pt" = c(df_dc$DC1, emb_pred, reg_pred),
                     "cond" = allcells_css[genes,cells]@meta.data$Condition)

# Fit GAM for each gene using pseudotime as independent variable.
t = df_meta$pt
cond = df_meta$cond
gene_fit_int_p = matrix(nrow = 0, ncol = 3)
colnames(gene_fit_int_p) = c("spline", "condition", "interaction")
gene_fit_int_vals = data.frame(row.names = 1:300)
predgrid = data.frame("t" = rep(seq(min(t), max(t), length.out = 100), 3),
                      "cond" = rep(names(hep_cells), each = 100))
for(i in genes){
  z = exp_hep[i,]
  
  d = data.frame(z=z, t=t, cond=cond)
  tmp = suppressMessages(gam(z ~ ns(t, df = 4)*cond, data=d))
  
  # bins for model fitting
  gene_fit_int_vals[,g] = suppressMessages(predict(tmp, newdata = predgrid))
  p = summary(tmp)$parametric.anova$`Pr(>F)`[1:3]
  gene_fit_int_p = rbind(gene_fit_int_p, p)
  rownames(gene_fit_int_p)[nrow(gene_fit_int_p)] = i
}
```

DTW condition vs condition

```{r}
cond_comb = combn(names(qvals_list), 2)
for(i in 1:ncol(cond_comb)){
  genes = intersect(names(qvals_list[[cond_comb[1,i]]])[qvals_list[[cond_comb[1,i]]]<=0.05],
                    names(qvals_list[[cond_comb[2,i]]])[qvals_list[[cond_comb[2,i]]]<=0.05])
  
  sc_list = list()
  for(cond in cond_comb[,i]){
    gene_fit_vals = fits_list[[cond]]
    gene_fit_vals_filt = t(gene_fit_vals[,genes])
    sc_list[[cond]] = t(apply(gene_fit_vals_filt, 1, scale))
  }
  
  xxx = dtw(sc_list[[1]], sc_list[[2]], window.type = "itakura", dist.method = "DTW")
  yyy = dtw(dist(t(sc_list[[1]]), t(sc_list[[2]]), method = "DTW"), window.type = "itakura")
}
```



```{r}
source("aux_scripts/primate_cerebral_organoids_pt_alignment.r")

xcv = align_pt_traj(expr_ref = hep_cells$healthy@assays$SCT@data[genes,], 
                    pt_ref = traj_list$healthy,
                    expr_query = hep_cells$embolised@assays$SCT@data[genes,], 
                    pt_query = traj_list$embolised, num_breaks_ref = 100, 
                    ref_name = "healthy", query_name = "embolised")

plot(traj_list$embolised, xcv$pt_query_aligned)
abline(0,1)

xcb = align_pt_traj(expr_ref = hep_cells$healthy@assays$SCT@data[genes,], 
                    pt_ref = traj_list$healthy,
                    expr_query = hep_cells$regenerating@assays$SCT@data[genes,], 
                    pt_query = traj_list$regenerating, num_breaks_ref = 100, 
                    ref_name = "healthy", query_name = "regenerating")

plot(traj_list$regenerating, xcb$pt_query_aligned)
abline(0,1)
```

```{r}
g = "CLEC1B"
plt_df = data.frame("traj" = c(traj_list$regenerating, traj_list$healthy),
           "exp" = c(end_cells$regenerating@assays$SCT@data[g,names(traj_list$regenerating)],
           end_cells$healthy@assays$SCT@data[g,]),
           "cond" = c(rep("regenerating", length(traj_list$regenerating)),
                      rep("healthy", length(traj_list$healthy))),
           "cl" = c(end_cells$regenerating@meta.data$allcells_simp,
                    end_cells$healthy@meta.data$allcells_simp))
plt_df = plt_df[order(plt_df$traj, decreasing = T),]

ggplot(plt_df, aes(x = traj, y = exp, group = cond, colour = cond))+
  geom_point()+
  stat_smooth()+
  labs(title = g)+
  theme_bw()

ggplot(plt_df, aes(x = traj, y = exp, group = cond, colour = cl, shape = cond))+
  geom_point()+
  stat_smooth()+
  labs(title = g)+
  theme_bw()

xxx = dtw::dtw(x = plt_df[plt_df$cond=="healthy","exp"], y = plt_df[plt_df$cond=="regenerating","exp"])
plot(xxx)
abline(0,1)

hti = cut_number(plt_df$traj[plt_df$cond=="healthy"], 100)
rti = cut_number(plt_df$traj[plt_df$cond=="regenerating"], 100)
df2 = data.frame("r" = tapply(plt_df$exp[plt_df$cond=="regenerating"], rti, mean),
                 "h" = tapply(plt_df$exp[plt_df$cond=="healthy"], hti, mean))
yyy = dtw::dtw(x = df2$h, y = df2$r)
plot(yyy)
abline(0,1)
```




Get GO Term enrichment

```{r}
library(topGO)
cor_lists = list("hep" = hep_cor_list, "end" = end_cor_list)

cc = -0.25
go_enr_list = list()
for(gr in names(cor_lists)){
  cor_list = cor_lists[[gr]]

  go_enr_list[[gr]] = list()
  for(comp in names(cor_list)){
    testdf = cor_list[[comp]]
    
    
    
    ncond = strsplit(comp, "_")[[1]]
    go_enr_list[[gr]][[comp]] = list()
    for(cond in ncond){
      gsig = names(hep_qval[[cond]])[hep_qval[[cond]]<=0.05]
      
      subtestdf = merge(testdf, hep_qval[[cond]], by = 0)
      
      genes = subtestdf$X..i..
      names(genes) = subtestdf[,1]
      
      # GO enrichment
      fsel = function(x) return(x<cc & subtestdf$y<=0.05)
      
      sampleGOdata <- new("topGOdata",
                          description = "test", ontology = "BP",
                          allGenes = genes, geneSel = fsel,
                          nodeSize = 5, annot = annFUN.org, 
                          mapping = "org.Hs.eg.db", ID = "symbol")
      
      resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", 
                               statistic = "fisher")
      
      resGO = GenTable(object = sampleGOdata, elimKS = resultKS.elim,
                       topNodes = resultKS.elim@geneData[4], numChar = 1000)
      resGO$elimKS = score(resultKS.elim)[resGO$GO.ID]
      resGO$cond = cond
      
      # get genes
      relevant_genes = sampleGOdata@allGenes[sampleGOdata@geneSelectionFun(sampleGOdata@allScores)]
      goGenes = t(data.frame(lapply(genesInTerm(sampleGOdata), function(x) paste(x[x %in% relevant_genes], collapse = ", "))))
      rownames(goGenes) = gsub(".", ":", rownames(goGenes), fixed = T)
      colnames(goGenes) = "genes"
      resGO = merge(resGO, goGenes, by.x = 1, by.y = 0, all.x = T)
      
      if(cond==ncond[[1]]){
        go_enr_list[[gr]][[comp]] = resGO[,c("GO.ID","Term","Annotated","Significant",
                                             "Expected","elimKS","cond","genes")]
      } else{
        go_enr_list[[gr]][[comp]] = rbind(go_enr_list[[gr]][[comp]],
                                          resGO[,c("GO.ID","Term","Annotated","Significant",
                                                   "Expected","elimKS","cond","genes")])
      }
      
    }
  }
}
saveRDS(go_enr_list, file = "results/zonation_cond/go_enr_corsig_list.RDS")
```


```{r}
gg = unique(c(names(end_qval$healthy)[order(end_qval$healthy, decreasing = F)][1:1000],
              names(end_qval$regenerating)[order(end_qval$regenerating, decreasing = F)][1:1000]))
diff_fits = apply(end_fits$healthy[,gg], 2, rank)-apply(end_fits$regenerating[,gg], 2, rank)
cl = hclust(dist(t(diff_fits)), method = "ward.D")
ct = cutree(cl, 6)
df = data.frame("cor" = end_cor_fit_list$healthy_regenerating[gg,2],
                 "score" = apply(apply(t(diff_fits), 1, range), 
                                 2, function(x) sum(abs(x))))
diff_genes = rownames(df)[df$cor<0.25]
cl = hclust(dist(t(diff_fits)[diff_genes,]), method = "ward.D")
ct = cutree(cl, 4)
df = data.frame("cl" = factor(ct),
                "cor" = 1-end_cor_fit_list$healthy_regenerating[diff_genes,2],
                 "score" = apply(apply(t(diff_fits)[diff_genes,], 1, range), 
                                 2, function(x) sum(abs(x))))
pheatmap::pheatmap(t(diff_fits)[diff_genes,], cluster_cols = F, 
                   clustering_method = "ward.D", fontsize_row = 8, annotation_row = df)

plot(hep_fits$healthy[,"SAA2"])
plot(hep_fits$regenerating[,"SAA2"])
```

```{r}
fsel = function(x) return(x<0.25 & names(genes) %in% gg)
fsel = function(x) return(names(genes) %in% gg)
genes = end_cor_fit_list$healthy_regenerating$X..i..
names(genes) = end_cor_fit_list$healthy_regenerating[,1]
genes = genes[!is.na(genes)]

eg = clusterProfiler::bitr(names(genes[fsel(genes)]), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg_all = clusterProfiler::bitr(names(genes), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")

x <- ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                               readable=T, universe = eg_all$ENTREZID)
clusterProfiler::dotplot(x)
head(as.data.frame(x))

ego3 <- clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                               OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                               qvalueCutoff  = 0.05, 
                               pAdjustMethod = "BH", readable = T)
clusterProfiler::dotplot(ego3)
```



```{r}
gg = unique(c(names(qvals_b_list$healthy)[order(qvals_b_list$healthy, decreasing = F)][1:1000],
              names(qvals_b_list$regenerating)[order(qvals_b_list$regenerating, decreasing = F)][1:1000]))
diff_fits = apply(fits_b_list$healthy[,gg], 2, rank)-apply(fits_b_list$regenerating[,gg], 2, rank)
cl = hclust(dist(t(diff_fits)), method = "ward.D")
ct = cutree(cl, 6)
df = data.frame("cor" = hep_cor_fit_list$healthy_regenerating[gg,2],
                 "score" = apply(apply(t(diff_fits), 1, range), 
                                 2, function(x) sum(abs(x))))
df = df[complete.cases(df),]
diff_genes = rownames(df)[df$cor<0.25]
cl = hclust(dist(t(diff_fits)[diff_genes,]), method = "ward.D")
ct = cutree(cl, 4)
df = data.frame("cl" = factor(ct),
                "cor" = 1-hep_cor_fit_list$healthy_regenerating[diff_genes,2],
                 "score" = apply(apply(t(diff_fits)[diff_genes,], 1, range), 
                                 2, function(x) sum(abs(x))))
pheatmap::pheatmap(t(diff_fits)[diff_genes,], cluster_cols = F, 
                   clustering_method = "ward.D", fontsize_row = 8, annotation_row = df)

plot(hep_fits$healthy[,"CPS1"])
plot(hep_fits$regenerating[,"CPS1"])
```


```{r}
genes_healthy = names(qvals_b_list$healthy)[order(qvals_b_list$healthy, decreasing = F)][qvals_b_list$healthy[order(qvals_b_list$healthy, decreasing = F)]<=0.05]
genes_regen = names(qvals_b_list$regenerating)[order(qvals_b_list$regenerating, decreasing = F)][qvals_b_list$regenerating[order(qvals_b_list$regenerating, decreasing = F)]<=0.05]
gg = unique(c(genes_healthy, genes_regen))

eg = clusterProfiler::bitr(gg, fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg_all = clusterProfiler::bitr(names(qvals_b_list$healthy), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")

reac = ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                               readable=T, universe = eg_all$ENTREZID)

got = clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                qvalueCutoff  = 0.05, 
                                pAdjustMethod = "BH", readable = T)

all_terms = rbind(as.data.frame(reac), as.data.frame(got))
all_terms$DB = c(rep("Reactome", nrow(as.data.frame(reac))), rep("GO Term", nrow(as.data.frame(got))))
all_terms = all_terms[all_terms$qvalue<=0.05,]
all_terms_noribo = all_terms[!grepl("RPL", all_terms$geneID) & !grepl("RPS", all_terms$geneID),]

nhealthy = c()
nregen = c()
corhealthy = c()
corregen = c()
mean_traj_h = matrix(0, nrow(all_terms_noribo), 100)
mean_traj_r = matrix(0, nrow(all_terms_noribo), 100)
cor_mean = c()
for(i in 1:nrow(all_terms_noribo)){
  genes_path = unlist(strsplit(all_terms_noribo[i,"geneID"],"/"))
  in_healthy = genes_path[genes_path %in% genes_healthy]
  in_regen = genes_path[genes_path %in% genes_regen]
  
  nhealthy = c(nhealthy, length(in_healthy))
  nregen = c(nregen, length(in_regen))
  
  if(length(in_healthy)>1){
    ch = cor(fits_b_list$healthy[,in_healthy], method = "sp")
    corhealthy = c(corhealthy, median(ch[upper.tri(ch)])) 
  } else{ 
    corhealthy = c(corhealthy,1)
  }
  if(length(in_regen)>1){
    ch = cor(fits_b_list$regen[,in_regen], method = "sp")
    corregen = c(corregen, median(ch[upper.tri(ch)])) 
  } else{ 
    corregen = c(corregen,1)
  }
  
  mean_traj_h[i,] = if(length(in_healthy)>1){ rowMeans(fits_b_list$healthy[,in_healthy])} else if(length(in_healthy)>0){fits_b_list$healthy[,in_healthy]} else {rep(0,100)}
  mean_traj_r[i,] = if(length(in_regen)>1){ rowMeans(fits_b_list$regenerating[,in_regen])} else if(length(in_regen)>0){fits_b_list$regenerating[,in_regen]} else {rep(0,100)}
  
  cor_mean = c(cor_mean, cor(mean_traj_h[i,], mean_traj_r[i,]))
}
all_terms_noribo = cbind(all_terms_noribo, nhealthy, nregen, corhealthy, corregen, cor_mean)
rownames(mean_traj_h) = rownames(mean_traj_r) = all_terms_noribo$Description


View(all_terms_noribo[all_terms_noribo$corhealthy>=0.25 & all_terms_noribo$corregen>=0.25 &
                        all_terms_noribo$nhealthy>=2 & all_terms_noribo$nregen>=2,])
i = which(all_terms_noribo$Description=="mRNA Splicing - Minor Pathway")
```



```{r}
genes_healthy = names(qvals_b_list$healthy)[qvals_b_list$healthy<=0.05]
genes_regen = names(qvals_b_list$regenerating)[qvals_b_list$regenerating<=0.05]
genes_conds = list("healthy" = setdiff(genes_healthy, genes_regen), 
                   "regen" = setdiff(genes_regen, genes_healthy),
                   "common" = intersect(genes_healthy, genes_regen))

terms_list = list()
for(n in names(genes_conds)){
  eg = clusterProfiler::bitr(genes_conds[[n]], fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
  eg_all = clusterProfiler::bitr(names(qvals_b_list$healthy), fromType="SYMBOL", 
                             toType="ENTREZID", OrgDb="org.Hs.eg.db")
  
  reac = ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                 readable=T, universe = eg_all$ENTREZID)
  
  got = clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                  OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                  qvalueCutoff  = 0.05, 
                                  pAdjustMethod = "BH", readable = T)
  
  all_terms = rbind(as.data.frame(reac), as.data.frame(got))
  all_terms$DB = c(rep("Reactome", nrow(as.data.frame(reac))), rep("GO Term", nrow(as.data.frame(got))))
  all_terms = all_terms[all_terms$qvalue<=0.05,]
  all_terms_noribo = all_terms[!grepl("RPL", all_terms$geneID) & !grepl("RPS", all_terms$geneID),]
  
  nhealthy = c()
  nregen = c()
  corhealthy = c()
  corregen = c()
  mean_traj_h = matrix(0, nrow(all_terms_noribo), 100)
  mean_traj_r = matrix(0, nrow(all_terms_noribo), 100)
  cor_mean = c()
  for(i in 1:nrow(all_terms_noribo)){
    genes_path = unlist(strsplit(all_terms_noribo[i,"geneID"],"/"))
    in_healthy = genes_path[genes_path %in% genes_healthy]
    in_regen = genes_path[genes_path %in% genes_regen]
    
    nhealthy = c(nhealthy, length(in_healthy))
    nregen = c(nregen, length(in_regen))
    
    if(length(in_healthy)>1){
      ch = cor(fits_b_list$healthy[,in_healthy], method = "sp")
      corhealthy = c(corhealthy, median(ch[upper.tri(ch)])) 
    } else{ 
      corhealthy = c(corhealthy,0)
    }
    if(length(in_regen)>1){
      ch = cor(fits_b_list$regen[,in_regen], method = "sp")
      corregen = c(corregen, median(ch[upper.tri(ch)])) 
    } else{ 
      corregen = c(corregen,0)
    }
    
    mean_traj_h[i,] = if(length(in_healthy)>1){ rowMeans(fits_b_list$healthy[,in_healthy])} else if(length(in_healthy)>0){fits_b_list$healthy[,in_healthy]} else {rep(0,100)}
    mean_traj_r[i,] = if(length(in_regen)>1){ rowMeans(fits_b_list$regenerating[,in_regen])} else if(length(in_regen)>0){fits_b_list$regenerating[,in_regen]} else {rep(0,100)}
    
    cor_mean = c(cor_mean, cor(mean_traj_h[i,], mean_traj_r[i,]))
  }
  all_terms_noribo = cbind(all_terms_noribo, nhealthy, nregen, corhealthy, corregen, cor_mean)
  rownames(mean_traj_h) = rownames(mean_traj_r) = all_terms_noribo$Description
  
  terms_list[[n]] = all_terms_noribo
}
```







```{r}
cl = hclust(dist(t(diff_fits)), method = "ward.D")
ct = cutree(cl, 1)

term_list = list()
for(i in unique(ct)){
  fsel = function(x) return(x<0.25 & names(genes) %in% names(ct)[ct==i])
  genes = hep_cor_fit_list$healthy_regenerating$X..i..
  names(genes) = hep_cor_fit_list$healthy_regenerating[,1]
  genes = genes[!is.na(genes)]
  
  eg = clusterProfiler::bitr(names(genes[fsel(genes)]), fromType="SYMBOL", 
                             toType="ENTREZID", OrgDb="org.Hs.eg.db")
  eg_all = clusterProfiler::bitr(names(genes), fromType="SYMBOL", 
                             toType="ENTREZID", OrgDb="org.Hs.eg.db")
  
  x <- ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                 readable=T, universe = eg_all$ENTREZID)
  
  term_list[[paste0("reac_",i)]] = x
  
  ego3 <- clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                                 OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                                 qvalueCutoff  = 0.05, 
                                 pAdjustMethod = "BH", readable = T)
  
  term_list[[paste0("go_",i)]] = ego3
}
```




```{r}
genes = hep_cor_fit_list$healthy_regenerating$X..i..
names(genes) = hep_cor_fit_list$healthy_regenerating[,1]
genes = genes[!is.na(genes)]

# GO enrichment
fsel = function(x) return(x<0.25 & names(genes) %in% gg)

sampleGOdata <- new("topGOdata",
                    description = "test", ontology = "BP",
                    allGenes = genes, geneSel = fsel,
                    nodeSize = 5, annot = annFUN.org, 
                    mapping = "org.Hs.eg.db", ID = "symbol")

resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", 
                         statistic = "fisher")

resGO = GenTable(object = sampleGOdata, elimKS = resultKS.elim,
                 topNodes = resultKS.elim@geneData[4], numChar = 1000)
resGO$elimKS = score(resultKS.elim)[resGO$GO.ID]


# get genes
relevant_genes = sampleGOdata@allGenes[sampleGOdata@geneSelectionFun(sampleGOdata@allScores)]
goGenes = t(data.frame(lapply(genesInTerm(sampleGOdata), function(x) paste(x[x %in% relevant_genes], collapse = ", "))))
rownames(goGenes) = gsub(".", ":", rownames(goGenes), fixed = T)
colnames(goGenes) = "genes"
resGO = merge(resGO, goGenes, by.x = 1, by.y = 0, all.x = T)
subresGO = resGO[!(grepl("RPL", resGO$genes) & grepl("RPS", resGO$genes)),]
```

```{r}
sig_resGO = resGO[resGO$elimKS<=0.05,]
hprof_go = list()
rprof_go = list()
for(l in 1:nrow(sig_resGO)){
  genes_go = unlist(strsplit(as.character(sig_resGO[l,"genes"]), ", "))
  
  hprof_go[[sig_resGO[l,2]]] = t(hep_fits$healthy[,genes_go])
  rprof_go[[sig_resGO[l,2]]] = t(hep_fits$regenerating[,genes_go])
}
hmax = unlist(lapply(hprof_go, function(x) which.max(colSums(x))))
rmax = unlist(lapply(rprof_go, function(x) which.max(colSums(x))))
names(hmax) = names(rmax) = sig_resGO$Term
```

```{r}
sig_resGO = sig_resGO[order(sig_resGO$elimKS, decreasing = F),]
go_short_list = list()
for(i in 1:nrow(sig_resGO)){
  g = unlist(strsplit(as.character(sig_resGO[i,"genes"]), ", "))
  if(!any(g %in% unlist(go_short_list)) & length(g)>1){
    go_short_list[[sig_resGO[i,2]]] = g
  }
}

res = matrix(0, 1, 100)
for(n in names(go_short_list)){
  if(length(go_short_list[[n]])>1){
    cl = hclust(dist(t(diff_fits[,go_short_list[[n]]])), method = "ward.D")
    res = rbind(res, t(diff_fits[,go_short_list[[n]]])[cl$order,])
  }
}
res = res[-1,]


pheatmap::pheatmap(res, cluster_cols = F, cluster_rows = F, fontsize_row = 8, gaps_row = cumsum(unlist(lapply(go_short_list, length))))
```

```{r}
sig_resGO = sig_resGO[order(sig_resGO$elimKS, decreasing = F),]
go_med = matrix(0, 1, 100)
for(i in 1:nrow(sig_resGO)){
  g = unlist(strsplit(as.character(sig_resGO[i,"genes"]), ", "))
  if(length(g)>1){
    go_med = rbind(go_med, apply(t(diff_fits[,g]), 2, median))
    rownames(go_med)[nrow(go_med)] = sig_resGO[i,2]
  }
}
go_med = go_med[-1,]
```

```{r}
eg = clusterProfiler::bitr(names(genes[fsel(genes)]), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg_all = clusterProfiler::bitr(names(genes), fromType="SYMBOL", 
                           toType="ENTREZID", OrgDb="org.Hs.eg.db")

x <- as.data.frame(ReactomePA::enrichPathway(gene=eg$ENTREZID,pvalueCutoff=0.05, 
                                             readable=T, universe = eg_all$ENTREZID))
head(as.data.frame(x))

go_med = matrix(0, 1, 100)
for(i in 1:nrow(x)){
  g = unlist(strsplit(as.character(x[i,"geneID"]), "/"))
  if(length(g)>1){
    go_med = rbind(go_med, apply(t(diff_fits[,g]), 2, median))
    rownames(go_med)[nrow(go_med)] = x[i,2]
  }
}
go_med = go_med[-1,]

genes_reac = strsplit(x$geneID, "/")
names(genes_reac) = x$Description
genes_reac = reshape2::melt(genes_reac)
dat = genes_reac[!duplicated(genes_reac$value),]
dat = data.frame(row.names = dat[,1], "Reac" = dat[,2])
pheatmap::pheatmap(t(diff_fits)[diff_genes,], cluster_cols = F, 
                   clustering_method = "ward.D", fontsize_row = 8, annotation_row = dat)

hprof_re = list()
rprof_re = list()
for(l in 1:nrow(x)){
  genes_go = unlist(strsplit(as.character(x[l,"geneID"]), "/"))
  
  hprof_re[[x[l,2]]] = t(hep_fits$healthy[,genes_go])
  rprof_re[[x[l,2]]] = t(hep_fits$regenerating[,genes_go])
}


hmet = ggplot(reshape2::melt(t(apply(hprof_re$`Metallothioneins bind metals`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Healthy - Metallothioneins bind metals")
rmet = ggplot(reshape2::melt(t(apply(rprof_re$`Metallothioneins bind metals`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Regen - Metallothioneins bind metals")
rlip = ggplot(reshape2::melt(t(apply(rprof_re$`Plasma lipoprotein remodeling`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Regen - Plasma lipoprotein remodeling")
hlip = ggplot(reshape2::melt(t(apply(hprof_re$`Plasma lipoprotein remodeling`, 1, scale))), 
              aes(x = Var2, y = value, group = Var1, colour = Var1))+
  geom_line()+labs(title = "Healthy - Plasma lipoprotein remodeling")
cowplot::plot_grid(hmet, rmet, hlip, rlip, ncol = 2, row = 2, align = "hv")









ego3 <- clusterProfiler::enrichGO(gene = eg$ENTREZID, universe = eg_all$ENTREZID, 
                               OrgDb = org.Hs.eg.db, ont = "BP", pvalueCutoff  = 0.01,
                               qvalueCutoff  = 0.05, 
                               pAdjustMethod = "BH", readable = T)


genes_ent = merge(eg_all, genes, by.x = 1, by.y = 0)
tmp = genes_ent[,2]
genes_ent = genes_ent[,3]
names(genes_ent) = tmp
ego4 <- clusterProfiler::gseGO(geneList = genes_ent[order(genes_ent, decreasing = T)], 
                               OrgDb = org.Hs.eg.db, ont = "BP", nPerm = 1000, minGSSize = 100,
                               maxGSSize = 500, pvalueCutoff = 0.05, verbose = FALSE)
```





```{r}
plot_genes_fits = cbind(hep_fits$healthy[,c("SAA1", "HAMP", "CYP3A4", "GLUL", "CPS1", "ARG1")],
                        hep_fits$embolised[,c("SAA1", "HAMP", "CYP3A4", "GLUL", "CPS1", "ARG1")],
                        hep_fits$regenerating[,c("SAA1", "HAMP", "CYP3A4", "GLUL", "CPS1", "ARG1")])
plot_genes_fits = apply(plot_genes_fits, 2, c)

colnames(plot_genes_fits) = paste0(colnames(plot_genes_fits), 
                                   rep(c("_healthy", "_embolised", "_regenerating"), each = 6))
plot_genes_fits = reshape2::melt(plot_genes_fits)

plot_genes_fits$cond = unlist(lapply(strsplit(as.character(plot_genes_fits$Var2), "_"), function(x) x[2]))

plot_genes_fits$cond = factor(plot_genes_fits$cond, levels = c("healthy", "embolised", "regenerating"))

plot_genes_fits$genes = unlist(lapply(strsplit(as.character(plot_genes_fits$Var2), "_"), function(x) x[1]))

ggplot(plot_genes_fits, aes(x = Var1, y = value, colour = genes))+facet_wrap(~cond)+geom_line()
```


```{r}
all_path = terms_groups$all[rpfilter(terms_groups$all),]
all_path = all_path[order(all_path$qvalue, decreasing = F),]

tab_go = reshape2::melt(lapply(df_cor$gene, function(g) all_path[which(grepl(g, all_path$geneID) & all_path$DB=="GO Term")[1],c("Description", "DB", "qvalue")][1,]))
tab_go$gene = df_cor$gene
tab_go = tab_go[,c(6,1,2,4)]
tab_go$de = apply(df_cor[,c(1,3,4,5)], 1, function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))])
tab_go$de[nchar(tab_go$de)>10] = NA
tab_go$de = unlist(tab_go$de)
tab_go$corr = df_cor$corr
tab_re = reshape2::melt(lapply(df_cor$gene, function(g) all_path[which(grepl(g, all_path$geneID) & all_path$DB=="Reactome")[1],c("Description", "DB", "qvalue")][1,]))
tab_re$gene = df_cor$gene
tab_re = tab_re[,c(6,1,2,4)]
tab_re$de = apply(df_cor[,c(1,3,4,5)], 1, function(x) colnames(df_cor[,c(3,4,5)])[which(as.logical(x[2:4]))])
tab_re$de[nchar(tab_re$de)>10] = NA
tab_re$de = unlist(tab_re$de)
tab_re$corr = df_cor$corr

tab_all = rbind(tab_go, tab_re)
tab_all = tab_all[complete.cases(tab_all),]

xxx = table(tab_all$Description, tab_all$de)
xxx = xxx[rowSums(xxx)>2,]
xxx = xxx/rowSums(xxx)
View(apply(xxx, 1, function(x) ifelse(any(x>0.5), colnames(xxx)[x>0.5], NA)))

aaa = cbind(tapply(tab_all$corr, tab_all$Description, function(x) sum(x>=0.25)/length(x)), 
            tapply(tab_all$corr, tab_all$Description, function(x) length(x)),
            tapply(tab_all$de, tab_all$Description, function(x) names(table(x))[which.max(table(x))]))
```

Calculate distances for all significant genes

```{r}
for(cond in names(qvals_list)){
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # dist
  if(file.exists(paste0("results/zonation_cond/dtw_distance_end_", cond, "_sigGenes.RDS"))){
    next
  } else{
    dd = dtw::dtwDist(scale_fit) # TAKES 4.5 HOURS
    saveRDS(dd, paste0("results/zonation_cond/dtw_distance_end_", cond, "_sigGenes.RDS"))
  }
}
```

Cluster genes (per condition)

```{r}
top_n = 1000
for(cond in names(qvals_list)){
  print(cond)
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  
  dd = readRDS(paste0("results/zonation_cond/dtw_distance_end_", cond, "_sigGenes.RDS"))
  
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # select top genes (0 if using all)
  topgenes = if(top_n==0) names(gene_fit_p) else names(head(gene_fit_p[order(gene_fit_p,
                                                                             decreasing = F)],
                                                            top_n))
  
  clust_genes = hclust(as.dist(dd[topgenes,topgenes]), method = "ward.D2")
  
  cls = cutree(clust_genes, 4)
  #ape::plot.phylo(ape::as.phylo(clust_genes), tip.color = rainbow(10)[cls])
  plt_list = list()
  for(i in unique(cls)){
    sub_df = scale_fit[names(cls)[cls==i],]
    plot_df = reshape2::melt(sub_df)
    plt_list[[paste0("gene_",i)]] = ggplot(plot_df, aes(x = Var2, y = value, group = Var1))+
      geom_hline(yintercept = 0, linetype = "dashed")+
      geom_line(alpha = 0.1, colour = "grey69")+
      stat_summary(aes(y = value, group=1), fun=mean, colour="red", geom="line",group=1)+
      labs(title = paste0("gene_",i))+
      theme_bw()
  }
  
  pdf(paste0("results/zonation_cond/end_gene_expression_profile_", cond, ".pdf"),
      height = 4.5, width = 4.4)
  print(cowplot::plot_grid(plotlist = plt_list, nrow = 2, ncol = 2, align = "hv"))
  dev.off()
  
  df_pval_cl = merge(data.frame(gene_fit_p), data.frame(cls), by = 0, all = T)
  colnames(df_pval_cl) = c("gene", "pval", "cluster")
  
  write.csv(df_pval_cl, file = paste0("results/zonation_cond/end_pval_clust_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)

  write.csv(gene_fit_vals, 
            file = paste0("results/zonation_cond/end_gene_fits_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)
}
```

Calculate distances for all significant genes

```{r}
for(cond in names(qvals_list)){
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # dist
  if(file.exists(paste0("results/zonation_cond/dtw_distance_", cond, "_sigGenes.RDS"))){
    next
  } else{
    dd = dtw::dtwDist(scale_fit) # TAKES 4.5 HOURS
    saveRDS(dd, paste0("results/zonation_cond/dtw_distance_", cond, "_sigGenes.RDS"))
  }
}
```

Cluster genes (per condition)

```{r}
top_n = 1000
for(cond in names(qvals_list)){
  print(cond)
  gene_fit_p = qvals_list[[cond]]
  gene_fit_vals = fits_list[[cond]]
  
  dd = readRDS(paste0("results/zonation_cond/dtw_distance_", cond, "_sigGenes.RDS"))
  
  gene_fit_vals_filt = t(gene_fit_vals[,names(gene_fit_p)[gene_fit_p<=0.05]])
  scale_fit = t(apply(gene_fit_vals_filt, 1, scale))
  
  # select top genes (0 if using all)
  topgenes = if(top_n==0) names(gene_fit_p) else names(head(gene_fit_p[order(gene_fit_p,
                                                                             decreasing = F)],
                                                            top_n))
  
  clust_genes = hclust(as.dist(dd[topgenes,topgenes]), method = "ward.D2")
  
  cls = cutree(clust_genes, 4)
  #ape::plot.phylo(ape::as.phylo(clust_genes), tip.color = rainbow(10)[cls])
  plt_list = list()
  for(i in unique(cls)){
    sub_df = scale_fit[names(cls)[cls==i],]
    plot_df = reshape2::melt(sub_df)
    plt_list[[paste0("gene_",i)]] = ggplot(plot_df, aes(x = Var2, y = value, group = Var1))+
      geom_hline(yintercept = 0, linetype = "dashed")+
      geom_line(alpha = 0.1, colour = "grey69")+
      stat_summary(aes(y = value, group=1), fun=mean, colour="red", geom="line",group=1)+
      labs(title = paste0("gene_",i))+
      theme_bw()
  }
  
  pdf(paste0("results/zonation_cond/hep_gene_expression_profile_", cond, ".pdf"),
      height = 4.5, width = 4.4)
  print(cowplot::plot_grid(plotlist = plt_list, nrow = 2, ncol = 2, align = "hv"))
  dev.off()
  
  df_pval_cl = merge(data.frame(gene_fit_p), data.frame(cls), by = 0, all = T)
  colnames(df_pval_cl) = c("gene", "pval", "cluster")
  
  write.csv(df_pval_cl, file = paste0("results/zonation_cond/hep_pval_clust_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)

  write.csv(gene_fit_vals, file = paste0("results/zonation_cond/hep_gene_fits_", cond, ".csv"),
            col.names = T, row.names = F, quote = F)
}
```

